Skip to content

Commit

Permalink
gh-118332: Fix deadlock involving stop the world (#118412)
Browse files Browse the repository at this point in the history
Avoid detaching thread state when stopping the world. When re-attaching
the thread state, the thread would attempt to resume the top-most
critical section, which might now be held by a thread paused for our
stop-the-world request.
  • Loading branch information
colesbury committed Apr 30, 2024
1 parent 4a1cf66 commit b2c3b70
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 7 deletions.
6 changes: 4 additions & 2 deletions Include/internal/pycore_lock.h
Expand Up @@ -150,8 +150,10 @@ PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt);

// Wait for the event to be set, or until the timeout expires. If the event is
// already set, then this returns immediately. Returns 1 if the event was set,
// and 0 if the timeout expired or thread was interrupted.
PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns);
// and 0 if the timeout expired or thread was interrupted. If `detach` is
// true, then the thread will detach/release the GIL while waiting.
PyAPI_FUNC(int)
PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach);

// _PyRawMutex implements a word-sized mutex that that does not depend on the
// parking lot API, and therefore can be used in the parking lot
Expand Down
85 changes: 85 additions & 0 deletions Modules/_testinternalcapi/test_critical_sections.c
Expand Up @@ -204,6 +204,90 @@ test_critical_sections_threads(PyObject *self, PyObject *Py_UNUSED(args))
Py_DECREF(test_data.obj1);
Py_RETURN_NONE;
}

static void
pysleep(int ms)
{
#ifdef MS_WINDOWS
Sleep(ms);
#else
usleep(ms * 1000);
#endif
}

struct test_data_gc {
PyObject *obj;
Py_ssize_t num_threads;
Py_ssize_t id;
Py_ssize_t countdown;
PyEvent done_event;
PyEvent ready;
};

static void
thread_gc(void *arg)
{
struct test_data_gc *test_data = arg;
PyGILState_STATE gil = PyGILState_Ensure();

Py_ssize_t id = _Py_atomic_add_ssize(&test_data->id, 1);
if (id == test_data->num_threads - 1) {
_PyEvent_Notify(&test_data->ready);
}
else {
// wait for all test threads to more reliably reproduce the issue.
PyEvent_Wait(&test_data->ready);
}

if (id == 0) {
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
// pause long enough that the lock would be handed off directly to
// a waiting thread.
pysleep(5);
PyGC_Collect();
Py_END_CRITICAL_SECTION();
}
else if (id == 1) {
pysleep(1);
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
pysleep(1);
Py_END_CRITICAL_SECTION();
}
else if (id == 2) {
// sleep long enough so that thread 0 is waiting to stop the world
pysleep(6);
Py_BEGIN_CRITICAL_SECTION(test_data->obj);
pysleep(1);
Py_END_CRITICAL_SECTION();
}

PyGILState_Release(gil);
if (_Py_atomic_add_ssize(&test_data->countdown, -1) == 1) {
// last thread to finish sets done_event
_PyEvent_Notify(&test_data->done_event);
}
}

static PyObject *
test_critical_sections_gc(PyObject *self, PyObject *Py_UNUSED(args))
{
// gh-118332: Contended critical sections should not deadlock with GC
const Py_ssize_t NUM_THREADS = 3;
struct test_data_gc test_data = {
.obj = PyDict_New(),
.countdown = NUM_THREADS,
.num_threads = NUM_THREADS,
};
assert(test_data.obj != NULL);

for (int i = 0; i < NUM_THREADS; i++) {
PyThread_start_new_thread(&thread_gc, &test_data);
}
PyEvent_Wait(&test_data.done_event);
Py_DECREF(test_data.obj);
Py_RETURN_NONE;
}

#endif

static PyMethodDef test_methods[] = {
Expand All @@ -212,6 +296,7 @@ static PyMethodDef test_methods[] = {
{"test_critical_sections_suspend", test_critical_sections_suspend, METH_NOARGS},
#ifdef Py_CAN_START_THREADS
{"test_critical_sections_threads", test_critical_sections_threads, METH_NOARGS},
{"test_critical_sections_gc", test_critical_sections_gc, METH_NOARGS},
#endif
{NULL, NULL} /* sentinel */
};
Expand Down
3 changes: 2 additions & 1 deletion Modules/_threadmodule.c
Expand Up @@ -501,7 +501,8 @@ ThreadHandle_join(ThreadHandle *self, PyTime_t timeout_ns)

// Wait until the deadline for the thread to exit.
PyTime_t deadline = timeout_ns != -1 ? _PyDeadline_Init(timeout_ns) : 0;
while (!PyEvent_WaitTimed(&self->thread_is_exiting, timeout_ns)) {
int detach = 1;
while (!PyEvent_WaitTimed(&self->thread_is_exiting, timeout_ns, detach)) {
if (deadline) {
// _PyDeadline_Get will return a negative value if the deadline has
// been exceeded.
Expand Down
6 changes: 3 additions & 3 deletions Python/lock.c
Expand Up @@ -277,12 +277,12 @@ _PyEvent_Notify(PyEvent *evt)
void
PyEvent_Wait(PyEvent *evt)
{
while (!PyEvent_WaitTimed(evt, -1))
while (!PyEvent_WaitTimed(evt, -1, /*detach=*/1))
;
}

int
PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns)
PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach)
{
for (;;) {
uint8_t v = _Py_atomic_load_uint8(&evt->v);
Expand All @@ -298,7 +298,7 @@ PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns)

uint8_t expected = _Py_HAS_PARKED;
(void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v),
timeout_ns, NULL, 1);
timeout_ns, NULL, detach);

return _Py_atomic_load_uint8(&evt->v) == _Py_LOCKED;
}
Expand Down
3 changes: 2 additions & 1 deletion Python/pystate.c
Expand Up @@ -2238,7 +2238,8 @@ stop_the_world(struct _stoptheworld_state *stw)
}

PyTime_t wait_ns = 1000*1000; // 1ms (arbitrary, may need tuning)
if (PyEvent_WaitTimed(&stw->stop_event, wait_ns)) {
int detach = 0;
if (PyEvent_WaitTimed(&stw->stop_event, wait_ns, detach)) {
assert(stw->thread_countdown == 0);
break;
}
Expand Down

0 comments on commit b2c3b70

Please sign in to comment.