Mirror of https://github.com/python/cpython.git, synced 2025-09-27 02:39:58 +00:00
bpo-31699 Deadlocks in concurrent.futures.ProcessPoolExecutor with pickling error (#3895)
Fix deadlocks in :class:`concurrent.futures.ProcessPoolExecutor` when task arguments or results cause pickling or unpickling errors. This should make sure that calls to the :class:`ProcessPoolExecutor` API always eventually return.
parent 65f2a6dcc2
commit 94459fd7dc
5 changed files with 387 additions and 57 deletions
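
For context before the diff: the class of bug fixed here shows up when a task argument (or result) fails to pickle. The sketch below is illustrative only and not part of the patch; the UnpicklableArg helper is made up. Before this commit, result() could block forever because the pickling error silently killed the call queue's feeder thread; after it, the error is set on the future and re-raised in the caller.

import concurrent.futures


class UnpicklableArg:
    # Hypothetical helper: pickling this object always fails.
    def __reduce__(self):
        raise TypeError("cannot pickle UnpicklableArg")


if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        future = executor.submit(print, UnpicklableArg())
        try:
            # With the fix, the pickling error is routed back to the
            # future instead of wedging the executor.
            print(future.result(timeout=30))
        except Exception as exc:
            print(f"error surfaced instead of deadlocking: {exc!r}")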
Lib/concurrent/futures/process.py

@@ -8,10 +8,10 @@ The follow diagram and text describe the data-flow through the system:
 |======================= In-process =====================|== Out-of-process ==|
 
 +----------+     +----------+       +--------+     +-----------+    +---------+
-|          |  => | Work Ids |  =>   |        |  => | Call Q    | => |         |
-|          |     +----------+       |        |     +-----------+    |         |
-|          |     | ...      |       |        |     | ...       |    |         |
-|          |     | 6        |       |        |     | 5, call() |    |         |
+|          |  => | Work Ids |       |        |     | Call Q    |    | Process |
+|          |     +----------+       |        |     +-----------+    |  Pool   |
+|          |     | ...      |       |        |     | ...       |    +---------+
+|          |     | 6        |  =>   |        |  => | 5, call() | => |         |
 |          |     | 7        |       |        |     | ...       |    |         |
 | Process  |     | ...      |       | Local  |     +-----------+    | Process |
 |  Pool    |     +----------+       | Worker |                      |  #1..n  |

@@ -52,6 +52,7 @@ import queue
 from queue import Full
 import multiprocessing as mp
 from multiprocessing.connection import wait
+from multiprocessing.queues import Queue
 import threading
 import weakref
 from functools import partial

@@ -72,16 +73,31 @@ import traceback
 # workers to exit when their work queues are empty and then waits until the
 # threads/processes finish.
 
-_threads_queues = weakref.WeakKeyDictionary()
+_threads_wakeups = weakref.WeakKeyDictionary()
 _global_shutdown = False
 
 
+class _ThreadWakeup:
+    __slot__ = ["_state"]
+
+    def __init__(self):
+        self._reader, self._writer = mp.Pipe(duplex=False)
+
+    def wakeup(self):
+        self._writer.send_bytes(b"")
+
+    def clear(self):
+        while self._reader.poll():
+            self._reader.recv_bytes()
+
+
 def _python_exit():
     global _global_shutdown
     _global_shutdown = True
-    items = list(_threads_queues.items())
-    for t, q in items:
-        q.put(None)
-    for t, q in items:
+    items = list(_threads_wakeups.items())
+    for _, thread_wakeup in items:
+        thread_wakeup.wakeup()
+    for t, _ in items:
         t.join()
 
 # Controls how many more calls than processes will be queued in the call queue.
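
The _ThreadWakeup helper added above is a small reusable pattern: a non-duplex pipe whose read end can be passed to multiprocessing.connection.wait() together with other readers, and whose write end any thread may poke to interrupt that wait. A stand-alone sketch of the idea, with our own function names:

import multiprocessing as mp
from multiprocessing.connection import wait

# Read end is watched by a manager loop; write end interrupts the wait.
reader, writer = mp.Pipe(duplex=False)

def interrupt():
    # Any thread can call this; one byte makes the reader "ready".
    writer.send_bytes(b"")

def drain():
    # Consume pending wakeup tokens so the next wait() blocks again.
    while reader.poll():
        reader.recv_bytes()

if __name__ == "__main__":
    interrupt()
    ready = wait([reader], timeout=1)  # returns at once: reader is ready
    assert reader in ready
    drain()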
@@ -90,6 +106,7 @@ def _python_exit():
 # (Futures in the call queue cannot be cancelled).
 EXTRA_QUEUED_CALLS = 1
 
+
 # Hack to embed stringification of remote traceback in local traceback
 
 class _RemoteTraceback(Exception):

@@ -132,6 +149,25 @@ class _CallItem(object):
         self.kwargs = kwargs
 
 
+class _SafeQueue(Queue):
+    """Safe Queue set exception to the future object linked to a job"""
+    def __init__(self, max_size=0, *, ctx, pending_work_items):
+        self.pending_work_items = pending_work_items
+        super().__init__(max_size, ctx=ctx)
+
+    def _on_queue_feeder_error(self, e, obj):
+        if isinstance(obj, _CallItem):
+            tb = traceback.format_exception(type(e), e, e.__traceback__)
+            e.__cause__ = _RemoteTraceback('\n"""\n{}"""'.format(''.join(tb)))
+            work_item = self.pending_work_items.pop(obj.work_id, None)
+            # work_item can be None if another process terminated. In this case,
+            # the queue_manager_thread fails all work_items with BrokenProcessPool
+            if work_item is not None:
+                work_item.future.set_exception(e)
+        else:
+            super()._on_queue_feeder_error(e, obj)
+
+
 def _get_chunks(*iterables, chunksize):
     """ Iterates over zip()ed iterables in chunks. """
     it = zip(*iterables)

@@ -152,6 +188,17 @@ def _process_chunk(fn, chunk):
     """
     return [fn(*args) for args in chunk]
 
 
+def _sendback_result(result_queue, work_id, result=None, exception=None):
+    """Safely send back the given result or exception"""
+    try:
+        result_queue.put(_ResultItem(work_id, result=result,
+                                     exception=exception))
+    except BaseException as e:
+        exc = _ExceptionWithTraceback(e, e.__traceback__)
+        result_queue.put(_ResultItem(work_id, exception=exc))
+
+
 def _process_worker(call_queue, result_queue, initializer, initargs):
     """Evaluates calls from call_queue and places the results in result_queue.
 
@@ -183,10 +230,9 @@ def _process_worker(call_queue, result_queue, initializer, initargs):
             r = call_item.fn(*call_item.args, **call_item.kwargs)
         except BaseException as e:
             exc = _ExceptionWithTraceback(e, e.__traceback__)
-            result_queue.put(_ResultItem(call_item.work_id, exception=exc))
+            _sendback_result(result_queue, call_item.work_id, exception=exc)
         else:
-            result_queue.put(_ResultItem(call_item.work_id,
-                                         result=r))
+            _sendback_result(result_queue, call_item.work_id, result=r)
 
         # Liberate the resource as soon as possible, to avoid holding onto
         # open files or shared memory that is not needed anymore

@@ -230,12 +276,14 @@ def _add_call_item_to_queue(pending_work_items,
             del pending_work_items[work_id]
             continue
 
 
 def _queue_management_worker(executor_reference,
                              processes,
                              pending_work_items,
                              work_ids_queue,
                              call_queue,
-                             result_queue):
+                             result_queue,
+                             thread_wakeup):
     """Manages the communication between this process and the worker processes.
 
     This function is run in a local thread.

@@ -253,6 +301,9 @@ def _queue_management_worker(executor_reference,
             derived from _WorkItems for processing by the process workers.
         result_queue: A ctx.SimpleQueue of _ResultItems generated by the
             process workers.
+        thread_wakeup: A _ThreadWakeup to allow waking up the
+            queue_manager_thread from the main Thread and avoid deadlocks
+            caused by permanently locked queues.
     """
     executor = None
 
@@ -261,10 +312,21 @@ def _queue_management_worker(executor_reference,
                 or executor._shutdown_thread)
 
     def shutdown_worker():
-        # This is an upper bound
-        nb_children_alive = sum(p.is_alive() for p in processes.values())
-        for i in range(0, nb_children_alive):
-            call_queue.put_nowait(None)
+        # This is an upper bound on the number of children alive.
+        n_children_alive = sum(p.is_alive() for p in processes.values())
+        n_children_to_stop = n_children_alive
+        n_sentinels_sent = 0
+        # Send the right number of sentinels, to make sure all children are
+        # properly terminated.
+        while n_sentinels_sent < n_children_to_stop and n_children_alive > 0:
+            for i in range(n_children_to_stop - n_sentinels_sent):
+                try:
+                    call_queue.put_nowait(None)
+                    n_sentinels_sent += 1
+                except Full:
+                    break
+            n_children_alive = sum(p.is_alive() for p in processes.values())
+
         # Release the queue's resources as soon as possible.
         call_queue.close()
         # If .join() is not called on the created processes then

@@ -272,19 +334,37 @@ def _queue_management_worker(executor_reference,
         for p in processes.values():
             p.join()
 
-    reader = result_queue._reader
+    result_reader = result_queue._reader
+    wakeup_reader = thread_wakeup._reader
+    readers = [result_reader, wakeup_reader]
 
     while True:
         _add_call_item_to_queue(pending_work_items,
                                 work_ids_queue,
                                 call_queue)
 
-        sentinels = [p.sentinel for p in processes.values()]
-        assert sentinels
-        ready = wait([reader] + sentinels)
-        if reader in ready:
-            result_item = reader.recv()
-        else:
+        # Wait for a result to be ready in the result_queue while checking
+        # that all worker processes are still running, or for a wake up
+        # signal send. The wake up signals come either from new tasks being
+        # submitted, from the executor being shutdown/gc-ed, or from the
+        # shutdown of the python interpreter.
+        worker_sentinels = [p.sentinel for p in processes.values()]
+        ready = wait(readers + worker_sentinels)
+
+        cause = None
+        is_broken = True
+        if result_reader in ready:
+            try:
+                result_item = result_reader.recv()
+                is_broken = False
+            except BaseException as e:
+                cause = traceback.format_exception(type(e), e, e.__traceback__)
+
+        elif wakeup_reader in ready:
+            is_broken = False
+            result_item = None
+        thread_wakeup.clear()
+        if is_broken:
             # Mark the process pool broken so that submits fail right now.
             executor = executor_reference()
             if executor is not None:
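
The rewritten loop above leans on the fact that multiprocessing.connection.wait() accepts a mix of connection objects and process sentinels, so one blocking call notices a new result, a wakeup signal, or a dead worker. A minimal sketch of that idiom; the worker function and timeout are our own, not part of the patch:

import multiprocessing as mp
import time
from multiprocessing.connection import wait

def worker(conn):
    time.sleep(0.2)
    conn.send("done")

if __name__ == "__main__":
    ctx = mp.get_context()
    result_reader, result_writer = ctx.Pipe(duplex=False)
    p = ctx.Process(target=worker, args=(result_writer,))
    p.start()
    # One wait() watches the result channel and the process sentinel,
    # so a crashed worker cannot leave us blocked forever.
    ready = wait([result_reader, p.sentinel], timeout=5)
    if result_reader in ready:
        print("result:", result_reader.recv())
    elif p.sentinel in ready:
        print("worker died with exitcode", p.exitcode)
    p.join()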
@@ -293,14 +373,15 @@ def _queue_management_worker(executor_reference,
                     'usable anymore')
                 executor._shutdown_thread = True
                 executor = None
+            bpe = BrokenProcessPool("A process in the process pool was "
+                                    "terminated abruptly while the future was "
+                                    "running or pending.")
+            if cause is not None:
+                bpe.__cause__ = _RemoteTraceback(
+                    f"\n'''\n{''.join(cause)}'''")
             # All futures in flight must be marked failed
             for work_id, work_item in pending_work_items.items():
-                work_item.future.set_exception(
-                    BrokenProcessPool(
-                        "A process in the process pool was "
-                        "terminated abruptly while the future was "
-                        "running or pending."
-                    ))
+                work_item.future.set_exception(bpe)
                 # Delete references to object. See issue16284
                 del work_item
             pending_work_items.clear()

@@ -329,6 +410,9 @@ def _queue_management_worker(executor_reference,
                 work_item.future.set_result(result_item.result)
                 # Delete references to object. See issue16284
                 del work_item
+        # Delete reference to result_item
+        del result_item
+
         # Check whether we should start shutting down.
         executor = executor_reference()
         # No more work items can be added if:

@@ -348,8 +432,11 @@ def _queue_management_worker(executor_reference,
                 pass
         executor = None
 
+
 _system_limits_checked = False
 _system_limited = None
+
+
 def _check_system_limits():
     global _system_limits_checked, _system_limited
     if _system_limits_checked:

@@ -369,7 +456,8 @@ def _check_system_limits():
         # minimum number of semaphores available
         # according to POSIX
         return
-    _system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
+    _system_limited = ("system provides too few semaphores (%d"
+                       " available, 256 necessary)" % nsems_max)
     raise NotImplementedError(_system_limited)
 
 
@@ -415,6 +503,7 @@ class ProcessPoolExecutor(_base.Executor):
             raise ValueError("max_workers must be greater than 0")
 
         self._max_workers = max_workers
+
        if mp_context is None:
            mp_context = mp.get_context()
        self._mp_context = mp_context

@@ -424,18 +513,9 @@ class ProcessPoolExecutor(_base.Executor):
         self._initializer = initializer
         self._initargs = initargs
 
-        # Make the call queue slightly larger than the number of processes to
-        # prevent the worker processes from idling. But don't make it too big
-        # because futures in the call queue cannot be cancelled.
-        queue_size = self._max_workers + EXTRA_QUEUED_CALLS
-        self._call_queue = mp_context.Queue(queue_size)
-        # Killed worker processes can produce spurious "broken pipe"
-        # tracebacks in the queue's own worker thread. But we detect killed
-        # processes anyway, so silence the tracebacks.
-        self._call_queue._ignore_epipe = True
-        self._result_queue = mp_context.SimpleQueue()
-        self._work_ids = queue.Queue()
+        # Management thread
         self._queue_management_thread = None
 
         # Map of pids to processes
         self._processes = {}

@@ -446,12 +526,39 @@ class ProcessPoolExecutor(_base.Executor):
         self._queue_count = 0
         self._pending_work_items = {}
 
+        # Create communication channels for the executor
+        # Make the call queue slightly larger than the number of processes to
+        # prevent the worker processes from idling. But don't make it too big
+        # because futures in the call queue cannot be cancelled.
+        queue_size = self._max_workers + EXTRA_QUEUED_CALLS
+        self._call_queue = _SafeQueue(
+            max_size=queue_size, ctx=self._mp_context,
+            pending_work_items=self._pending_work_items)
+        # Killed worker processes can produce spurious "broken pipe"
+        # tracebacks in the queue's own worker thread. But we detect killed
+        # processes anyway, so silence the tracebacks.
+        self._call_queue._ignore_epipe = True
+        self._result_queue = mp_context.SimpleQueue()
+        self._work_ids = queue.Queue()
+
+        # _ThreadWakeup is a communication channel used to interrupt the wait
+        # of the main loop of queue_manager_thread from another thread (e.g.
+        # when calling executor.submit or executor.shutdown). We do not use the
+        # _result_queue to send the wakeup signal to the queue_manager_thread
+        # as it could result in a deadlock if a worker process dies with the
+        # _result_queue write lock still acquired.
+        self._queue_management_thread_wakeup = _ThreadWakeup()
+
     def _start_queue_management_thread(self):
-        # When the executor gets lost, the weakref callback will wake up
-        # the queue management thread.
-        def weakref_cb(_, q=self._result_queue):
-            q.put(None)
         if self._queue_management_thread is None:
+            # When the executor gets garbarge collected, the weakref callback
+            # will wake up the queue management thread so that it can terminate
+            # if there is no pending work item.
+            def weakref_cb(_,
+                           thread_wakeup=self._queue_management_thread_wakeup):
+                mp.util.debug('Executor collected: triggering callback for'
+                              ' QueueManager wakeup')
+                thread_wakeup.wakeup()
             # Start the processes so that their sentinels are known.
             self._adjust_process_count()
             self._queue_management_thread = threading.Thread(

@@ -461,10 +568,13 @@ class ProcessPoolExecutor(_base.Executor):
                       self._pending_work_items,
                       self._work_ids,
                       self._call_queue,
-                      self._result_queue))
+                      self._result_queue,
+                      self._queue_management_thread_wakeup),
+                name="QueueManagerThread")
             self._queue_management_thread.daemon = True
             self._queue_management_thread.start()
-            _threads_queues[self._queue_management_thread] = self._result_queue
+            _threads_wakeups[self._queue_management_thread] = \
+                self._queue_management_thread_wakeup
 
     def _adjust_process_count(self):
         for _ in range(len(self._processes), self._max_workers):

@@ -491,7 +601,7 @@ class ProcessPoolExecutor(_base.Executor):
             self._work_ids.put(self._queue_count)
             self._queue_count += 1
             # Wake up queue management thread
-            self._result_queue.put(None)
+            self._queue_management_thread_wakeup.wakeup()
 
             self._start_queue_management_thread()
             return f

@@ -531,7 +641,7 @@ class ProcessPoolExecutor(_base.Executor):
         self._shutdown_thread = True
         if self._queue_management_thread:
             # Wake up queue management thread
-            self._result_queue.put(None)
+            self._queue_management_thread_wakeup.wakeup()
             if wait:
                 self._queue_management_thread.join()
         # To reduce the risk of opening too many files, remove references to
Lib/multiprocessing/queues.py

@@ -160,9 +160,10 @@ class Queue(object):
         self._thread = threading.Thread(
             target=Queue._feed,
             args=(self._buffer, self._notempty, self._send_bytes,
-                  self._wlock, self._writer.close, self._ignore_epipe),
+                  self._wlock, self._writer.close, self._ignore_epipe,
+                  self._on_queue_feeder_error),
             name='QueueFeederThread'
         )
         self._thread.daemon = True
 
         debug('doing self._thread.start()')

@@ -201,7 +202,8 @@ class Queue(object):
             notempty.notify()
 
     @staticmethod
-    def _feed(buffer, notempty, send_bytes, writelock, close, ignore_epipe):
+    def _feed(buffer, notempty, send_bytes, writelock, close, ignore_epipe,
+              onerror):
         debug('starting thread to feed data to pipe')
         nacquire = notempty.acquire
         nrelease = notempty.release

@@ -253,8 +255,17 @@ class Queue(object):
                     info('error in queue thread: %s', e)
                     return
                 else:
-                    import traceback
-                    traceback.print_exc()
+                    onerror(e, obj)
+
+    @staticmethod
+    def _on_queue_feeder_error(e, obj):
+        """
+        Private API hook called when feeding data in the background thread
+        raises an exception.  For overriding by concurrent.futures.
+        """
+        import traceback
+        traceback.print_exc()
 
 
 _sentinel = object()
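
The _on_queue_feeder_error(e, obj) hook introduced above is private API with a simple contract: the feeder thread calls it with the exception and the object that failed to serialize, instead of unconditionally printing a traceback. The test added in the next file exercises exactly this; here is a condensed stand-alone sketch using our own class names:

import multiprocessing
import multiprocessing.queues


class LoggingQueue(multiprocessing.queues.Queue):
    # Hypothetical subclass that records feeder-thread errors.
    @staticmethod
    def _on_queue_feeder_error(e, obj):
        # Runs in the feeder thread with the raised exception and the
        # offending object (private hook; may change between versions).
        print(f"failed to feed {obj!r}: {e!r}")


class Unpicklable:
    def __reduce__(self):
        raise ValueError("no pickling today")


if __name__ == "__main__":
    q = LoggingQueue(ctx=multiprocessing.get_context())
    q.put(Unpicklable())  # reported via the hook; the queue survives
    q.put(42)
    print(q.get(timeout=1))  # -> 42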
Lib/test/_test_multiprocessing.py

@@ -1029,6 +1029,43 @@ class _TestQueue(BaseTestCase):
         self.assertTrue(q.get(timeout=1.0))
         close_queue(q)
 
+    def test_queue_feeder_on_queue_feeder_error(self):
+        # bpo-30006: verify feeder handles exceptions using the
+        # _on_queue_feeder_error hook.
+        if self.TYPE != 'processes':
+            self.skipTest('test not appropriate for {}'.format(self.TYPE))
+
+        class NotSerializable(object):
+            """Mock unserializable object"""
+            def __init__(self):
+                self.reduce_was_called = False
+                self.on_queue_feeder_error_was_called = False
+
+            def __reduce__(self):
+                self.reduce_was_called = True
+                raise AttributeError
+
+        class SafeQueue(multiprocessing.queues.Queue):
+            """Queue with overloaded _on_queue_feeder_error hook"""
+            @staticmethod
+            def _on_queue_feeder_error(e, obj):
+                if (isinstance(e, AttributeError) and
+                        isinstance(obj, NotSerializable)):
+                    obj.on_queue_feeder_error_was_called = True
+
+        not_serializable_obj = NotSerializable()
+        # The captured_stderr reduces the noise in the test report
+        with test.support.captured_stderr():
+            q = SafeQueue(ctx=multiprocessing.get_context())
+            q.put(not_serializable_obj)
+
+            # Verify that q is still functionning correctly
+            q.put(True)
+            self.assertTrue(q.get(timeout=1.0))
+
+        # Assert that the serialization and the hook have been called correctly
+        self.assertTrue(not_serializable_obj.reduce_was_called)
+        self.assertTrue(not_serializable_obj.on_queue_feeder_error_was_called)
 #
 #
 #
Lib/test/test_concurrent_futures.py

@@ -18,6 +18,7 @@ import threading
 import time
 import unittest
 import weakref
+from pickle import PicklingError
 
 from concurrent import futures
 from concurrent.futures._base import (

@@ -394,16 +395,17 @@ class ProcessPoolShutdownTest(ExecutorShutdownTest):
-        queue_management_thread = executor._queue_management_thread
         processes = executor._processes
         call_queue = executor._call_queue
+        queue_management_thread = executor._queue_management_thread
         del executor
 
+        # Make sure that all the executor ressources were properly cleaned by
+        # the shutdown process
         queue_management_thread.join()
         for p in processes.values():
             p.join()
-        call_queue.close()
         call_queue.join_thread()
 
 
 create_executor_tests(ProcessPoolShutdownTest,
                       executor_mixins=(ProcessPoolForkMixin,
                                        ProcessPoolForkserverMixin,

@@ -784,6 +786,172 @@ create_executor_tests(ProcessPoolExecutorTest,
                                        ProcessPoolForkserverMixin,
                                        ProcessPoolSpawnMixin))
 
+
+def hide_process_stderr():
+    import io
+    sys.stderr = io.StringIO()
+
+
+def _crash(delay=None):
+    """Induces a segfault."""
+    if delay:
+        time.sleep(delay)
+    import faulthandler
+    faulthandler.disable()
+    faulthandler._sigsegv()
+
+
+def _exit():
+    """Induces a sys exit with exitcode 1."""
+    sys.exit(1)
+
+
+def _raise_error(Err):
+    """Function that raises an Exception in process."""
+    hide_process_stderr()
+    raise Err()
+
+
+def _return_instance(cls):
+    """Function that returns a instance of cls."""
+    hide_process_stderr()
+    return cls()
+
+
+class CrashAtPickle(object):
+    """Bad object that triggers a segfault at pickling time."""
+    def __reduce__(self):
+        _crash()
+
+
+class CrashAtUnpickle(object):
+    """Bad object that triggers a segfault at unpickling time."""
+    def __reduce__(self):
+        return _crash, ()
+
+
+class ExitAtPickle(object):
+    """Bad object that triggers a process exit at pickling time."""
+    def __reduce__(self):
+        _exit()
+
+
+class ExitAtUnpickle(object):
+    """Bad object that triggers a process exit at unpickling time."""
+    def __reduce__(self):
+        return _exit, ()
+
+
+class ErrorAtPickle(object):
+    """Bad object that triggers an error at pickling time."""
+    def __reduce__(self):
+        from pickle import PicklingError
+        raise PicklingError("Error in pickle")
+
+
+class ErrorAtUnpickle(object):
+    """Bad object that triggers an error at unpickling time."""
+    def __reduce__(self):
+        from pickle import UnpicklingError
+        return _raise_error, (UnpicklingError, )
+
+
+class ExecutorDeadlockTest:
+    TIMEOUT = 15
+
+    @classmethod
+    def _sleep_id(cls, x, delay):
+        time.sleep(delay)
+        return x
+
+    def _fail_on_deadlock(self, executor):
+        # If we did not recover before TIMEOUT seconds, consider that the
+        # executor is in a deadlock state and forcefully clean all its
+        # composants.
+        import faulthandler
+        from tempfile import TemporaryFile
+        with TemporaryFile(mode="w+") as f:
+            faulthandler.dump_traceback(file=f)
+            f.seek(0)
+            tb = f.read()
+        for p in executor._processes.values():
+            p.terminate()
+        # This should be safe to call executor.shutdown here as all possible
+        # deadlocks should have been broken.
+        executor.shutdown(wait=True)
+        print(f"\nTraceback:\n {tb}", file=sys.__stderr__)
+        self.fail(f"Executor deadlock:\n\n{tb}")
+
+    def test_crash(self):
+        # extensive testing for deadlock caused by crashes in a pool.
+        self.executor.shutdown(wait=True)
+        crash_cases = [
+            # Check problem occuring while pickling a task in
+            # the task_handler thread
+            (id, (ErrorAtPickle(),), PicklingError, "error at task pickle"),
+            # Check problem occuring while unpickling a task on workers
+            (id, (ExitAtUnpickle(),), BrokenProcessPool,
+             "exit at task unpickle"),
+            (id, (ErrorAtUnpickle(),), BrokenProcessPool,
+             "error at task unpickle"),
+            (id, (CrashAtUnpickle(),), BrokenProcessPool,
+             "crash at task unpickle"),
+            # Check problem occuring during func execution on workers
+            (_crash, (), BrokenProcessPool,
+             "crash during func execution on worker"),
+            (_exit, (), SystemExit,
+             "exit during func execution on worker"),
+            (_raise_error, (RuntimeError, ), RuntimeError,
+             "error during func execution on worker"),
+            # Check problem occuring while pickling a task result
+            # on workers
+            (_return_instance, (CrashAtPickle,), BrokenProcessPool,
+             "crash during result pickle on worker"),
+            (_return_instance, (ExitAtPickle,), SystemExit,
+             "exit during result pickle on worker"),
+            (_return_instance, (ErrorAtPickle,), PicklingError,
+             "error during result pickle on worker"),
+            # Check problem occuring while unpickling a task in
+            # the result_handler thread
+            (_return_instance, (ErrorAtUnpickle,), BrokenProcessPool,
+             "error during result unpickle in result_handler"),
+            (_return_instance, (ExitAtUnpickle,), BrokenProcessPool,
+             "exit during result unpickle in result_handler")
+        ]
+        for func, args, error, name in crash_cases:
+            with self.subTest(name):
+                # The captured_stderr reduces the noise in the test report
+                with test.support.captured_stderr():
+                    executor = self.executor_type(
+                        max_workers=2, mp_context=get_context(self.ctx))
+                    res = executor.submit(func, *args)
+                    with self.assertRaises(error):
+                        try:
+                            res.result(timeout=self.TIMEOUT)
+                        except futures.TimeoutError:
+                            # If we did not recover before TIMEOUT seconds,
+                            # consider that the executor is in a deadlock state
+                            self._fail_on_deadlock(executor)
+                    executor.shutdown(wait=True)
+
+    def test_shutdown_deadlock(self):
+        # Test that the pool calling shutdown do not cause deadlock
+        # if a worker fails after the shutdown call.
+        self.executor.shutdown(wait=True)
+        with self.executor_type(max_workers=2,
+                                mp_context=get_context(self.ctx)) as executor:
+            self.executor = executor  # Allow clean up in fail_on_deadlock
+            f = executor.submit(_crash, delay=.1)
+            executor.shutdown(wait=True)
+            with self.assertRaises(BrokenProcessPool):
+                f.result()
+
+
+create_executor_tests(ExecutorDeadlockTest,
+                      executor_mixins=(ProcessPoolForkMixin,
+                                       ProcessPoolForkserverMixin,
+                                       ProcessPoolSpawnMixin))
+
+
 class FutureTests(BaseTestCase):
     def test_done_callback_with_result(self):
Misc/NEWS.d/next/Library/… (new file)

@@ -0,0 +1,4 @@
+Fix deadlocks in :class:`concurrent.futures.ProcessPoolExecutor` when
+task arguments or results cause pickling or unpickling errors.
+This should make sure that calls to the :class:`ProcessPoolExecutor` API
+always eventually return.