gh-137433: Fix deadlock with stop-the-world and daemon threads (gh-137735)
There was a deadlock originally seen by Memray when a daemon thread enabled or disabled profiling while the interpreter was shutting down. I think this could also happen with garbage collection, but I haven't seen that in practice. The daemon thread could be hung while trying to acquire the global rwmutex that prevents overlapping global and per-interpreter stop-the-world events. Since it already held the main interpreter's stop-the-world lock, it also deadlocked the main thread, which was trying to perform interpreter finalization. Swap the order of lock acquisition to prevent this deadlock. Additionally, refactor `_PyParkingLot_Park` so that the global buckets hashtable is left in a clean state if the thread is hung in `PyEval_AcquireThread`.
This commit is contained in:
@@ -46,10 +46,8 @@ typedef struct _PySemaphore {
|
||||
} _PySemaphore;
|
||||
|
||||
// Puts the current thread to sleep until _PySemaphore_Wakeup() is called.
|
||||
// If `detach` is true, then the thread will detach/release the GIL while
|
||||
// sleeping.
|
||||
PyAPI_FUNC(int)
|
||||
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout_ns, int detach);
|
||||
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout_ns);
|
||||
|
||||
// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup()
|
||||
// can be called before _PySemaphore_Wait().
|
||||
|
||||
@@ -1430,6 +1430,33 @@ class ThreadTests(BaseTestCase):
|
||||
self.assertEqual(len(native_ids), 2)
|
||||
self.assertNotEqual(native_ids[0], native_ids[1])
|
||||
|
||||
def test_stop_the_world_during_finalization(self):
|
||||
# gh-137433: Test functions that trigger a stop-the-world in the free
|
||||
# threading build concurrent with interpreter finalization.
|
||||
script = """if True:
|
||||
import gc
|
||||
import sys
|
||||
import threading
|
||||
NUM_THREADS = 5
|
||||
b = threading.Barrier(NUM_THREADS + 1)
|
||||
def run_in_bg():
|
||||
b.wait()
|
||||
while True:
|
||||
sys.setprofile(None)
|
||||
gc.collect()
|
||||
|
||||
for _ in range(NUM_THREADS):
|
||||
t = threading.Thread(target=run_in_bg, daemon=True)
|
||||
t.start()
|
||||
|
||||
b.wait()
|
||||
print("Exiting...")
|
||||
"""
|
||||
rc, out, err = assert_python_ok('-c', script)
|
||||
self.assertEqual(rc, 0)
|
||||
self.assertEqual(err, b"")
|
||||
self.assertEqual(out.strip(), b"Exiting...")
|
||||
|
||||
class ThreadJoinOnShutdown(BaseTestCase):
|
||||
|
||||
def _run_and_join(self, script):
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
Fix a potential deadlock in the :term:`free threading` build when daemon
|
||||
threads enable or disable profiling or tracing while the main thread is
|
||||
shutting down the interpreter.
|
||||
@@ -227,7 +227,7 @@ _PyRawMutex_LockSlow(_PyRawMutex *m)
|
||||
|
||||
// Wait for us to be woken up. Note that we still have to lock the
|
||||
// mutex ourselves: it is NOT handed off to us.
|
||||
_PySemaphore_Wait(&waiter.sema, -1, /*detach=*/0);
|
||||
_PySemaphore_Wait(&waiter.sema, -1);
|
||||
}
|
||||
|
||||
_PySemaphore_Destroy(&waiter.sema);
|
||||
|
||||
@@ -91,8 +91,8 @@ _PySemaphore_Destroy(_PySemaphore *sema)
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
_PySemaphore_PlatformWait(_PySemaphore *sema, PyTime_t timeout)
|
||||
int
|
||||
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout)
|
||||
{
|
||||
int res;
|
||||
#if defined(MS_WINDOWS)
|
||||
@@ -225,27 +225,6 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, PyTime_t timeout)
|
||||
return res;
|
||||
}
|
||||
|
||||
int
|
||||
_PySemaphore_Wait(_PySemaphore *sema, PyTime_t timeout, int detach)
|
||||
{
|
||||
PyThreadState *tstate = NULL;
|
||||
if (detach) {
|
||||
tstate = _PyThreadState_GET();
|
||||
if (tstate && _PyThreadState_IsAttached(tstate)) {
|
||||
// Only detach if we are attached
|
||||
PyEval_ReleaseThread(tstate);
|
||||
}
|
||||
else {
|
||||
tstate = NULL;
|
||||
}
|
||||
}
|
||||
int res = _PySemaphore_PlatformWait(sema, timeout);
|
||||
if (tstate) {
|
||||
PyEval_AcquireThread(tstate);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
void
|
||||
_PySemaphore_Wakeup(_PySemaphore *sema)
|
||||
{
|
||||
@@ -342,7 +321,19 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
|
||||
enqueue(bucket, addr, &wait);
|
||||
_PyRawMutex_Unlock(&bucket->mutex);
|
||||
|
||||
int res = _PySemaphore_Wait(&wait.sema, timeout_ns, detach);
|
||||
PyThreadState *tstate = NULL;
|
||||
if (detach) {
|
||||
tstate = _PyThreadState_GET();
|
||||
if (tstate && _PyThreadState_IsAttached(tstate)) {
|
||||
// Only detach if we are attached
|
||||
PyEval_ReleaseThread(tstate);
|
||||
}
|
||||
else {
|
||||
tstate = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
int res = _PySemaphore_Wait(&wait.sema, timeout_ns);
|
||||
if (res == Py_PARK_OK) {
|
||||
goto done;
|
||||
}
|
||||
@@ -354,7 +345,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
|
||||
// Another thread has started to unpark us. Wait until we process the
|
||||
// wakeup signal.
|
||||
do {
|
||||
res = _PySemaphore_Wait(&wait.sema, -1, detach);
|
||||
res = _PySemaphore_Wait(&wait.sema, -1);
|
||||
} while (res != Py_PARK_OK);
|
||||
goto done;
|
||||
}
|
||||
@@ -366,6 +357,9 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size,
|
||||
|
||||
done:
|
||||
_PySemaphore_Destroy(&wait.sema);
|
||||
if (tstate) {
|
||||
PyEval_AcquireThread(tstate);
|
||||
}
|
||||
return res;
|
||||
|
||||
}
|
||||
|
||||
@@ -2253,13 +2253,15 @@ stop_the_world(struct _stoptheworld_state *stw)
|
||||
{
|
||||
_PyRuntimeState *runtime = &_PyRuntime;
|
||||
|
||||
PyMutex_Lock(&stw->mutex);
|
||||
// gh-137433: Acquire the rwmutex first to avoid deadlocks with daemon
|
||||
// threads that may hang when blocked on lock acquisition.
|
||||
if (stw->is_global) {
|
||||
_PyRWMutex_Lock(&runtime->stoptheworld_mutex);
|
||||
}
|
||||
else {
|
||||
_PyRWMutex_RLock(&runtime->stoptheworld_mutex);
|
||||
}
|
||||
PyMutex_Lock(&stw->mutex);
|
||||
|
||||
HEAD_LOCK(runtime);
|
||||
stw->requested = 1;
|
||||
@@ -2325,13 +2327,13 @@ start_the_world(struct _stoptheworld_state *stw)
|
||||
}
|
||||
stw->requester = NULL;
|
||||
HEAD_UNLOCK(runtime);
|
||||
PyMutex_Unlock(&stw->mutex);
|
||||
if (stw->is_global) {
|
||||
_PyRWMutex_Unlock(&runtime->stoptheworld_mutex);
|
||||
}
|
||||
else {
|
||||
_PyRWMutex_RUnlock(&runtime->stoptheworld_mutex);
|
||||
}
|
||||
PyMutex_Unlock(&stw->mutex);
|
||||
}
|
||||
#endif // Py_GIL_DISABLED
|
||||
|
||||
|
||||
Reference in New Issue
Block a user