Thread 16 (Thread 0x7fffe4c5c700 (LWP 945481)): #0 0x00007fffebf3c4ed in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007fffebf37dcb in _L_lock_883 () from /lib64/libpthread.so.0 #2 0x00007fffebf37c98 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x0000555555fb2b77 in ceph::mutex_debug_detail::mutex_debug_impl<false>::lock_impl (this=0x7fffffffd688) at /home/runsisi/build/master/src/common/mutex_debug.h:121 #4 0x0000555555fb23c2 in ceph::mutex_debug_detail::mutex_debug_impl<false>::lock (this=0x7fffffffd688, no_lockdep=false) at /home/runsisi/build/master/src/common/mutex_debug.h:185 #5 0x0000555555fb1cfb in std::lock_guard<ceph::mutex_debug_detail::mutex_debug_impl<false> >::lock_guard (this=0x7fffe4c593c0, __m=...) at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_mutex.h:162 #6 0x00005555560f0ff8 in MgrStandby::ms_dispatch2 (this=0x7fffffffb460, m=...) at /home/runsisi/build/master/src/mgr/MgrStandby.cc:435 #7 0x00007fffeebe1e6d in Messenger::ms_deliver_dispatch (this=0x5555571d6900, m=...) at /home/runsisi/build/master/src/msg/Messenger.h:703 #8 0x00007fffeebe090a in DispatchQueue::entry (this=0x5555571d6c60) at /home/runsisi/build/master/src/msg/DispatchQueue.cc:201 #9 0x00007fffeed5ecec in DispatchQueue::DispatchThread::entry (this=0x5555571d6dd8) at /home/runsisi/build/master/src/msg/DispatchQueue.h:101 #10 0x00007fffee9c3796 in Thread::entry_wrapper (this=0x5555571d6dd8) at /home/runsisi/build/master/src/common/Thread.cc:91 #11 0x00007fffee9c3714 in Thread::_entry_func (arg=0x5555571d6dd8) at /home/runsisi/build/master/src/common/Thread.cc:75 #12 0x00007fffebf35dd5 in start_thread () from /lib64/libpthread.so.0 #13 0x00007fffeb013ead in clone () from /lib64/libc.so.6 (gdb) thread apply 25 bt
Thread 25 (Thread 0x7fffe0453700 (LWP 945491)): #0 0x00007fffebf3c4ed in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00007fffebf37dcb in _L_lock_883 () from /lib64/libpthread.so.0 #2 0x00007fffebf37c98 in pthread_mutex_lock () from /lib64/libpthread.so.0 #3 0x00007fffee8d0baf in ceph::mutex_debug_detail::mutex_debug_impl<false>::lock_impl (this=0x7fffffffd688) at /home/runsisi/build/master/src/common/mutex_debug.h:121 #4 0x00007fffee8cee8e in ceph::mutex_debug_detail::mutex_debug_impl<false>::lock (this=0x7fffffffd688, no_lockdep=false) at /home/runsisi/build/master/src/common/mutex_debug.h:185 #5 0x00007fffee8cef6a in std::unique_lock<ceph::mutex_debug_detail::mutex_debug_impl<false> >::lock (this=0x7fffe04505a0) at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_mutex.h:267 #6 0x00007fffee8cd15e in std::unique_lock<ceph::mutex_debug_detail::mutex_debug_impl<false> >::unique_lock (this=0x7fffe04505a0, __m=...) at /opt/rh/devtoolset-8/root/usr/include/c++/8/bits/std_mutex.h:197 #7 0x00007fffee9cfd18 in SafeTimer::timer_thread (this=0x7fffffffd800) at /home/runsisi/build/master/src/common/Timer.cc:76 #8 0x00007fffee9d1af6 in SafeTimerThread::entry (this=0x55555716cb70) at /home/runsisi/build/master/src/common/Timer.cc:32 #9 0x00007fffee9c3796 in Thread::entry_wrapper (this=0x55555716cb70) at /home/runsisi/build/master/src/common/Thread.cc:91 #10 0x00007fffee9c3714 in Thread::_entry_func (arg=0x55555716cb70) at /home/runsisi/build/master/src/common/Thread.cc:75 #11 0x00007fffebf35dd5 in start_thread () from /lib64/libpthread.so.0 #12 0x00007fffeb013ead in clone () from /lib64/libc.so.6
16 号线程和 25 号线程在等待同一把锁:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
(gdb) thread apply 16 f 6
Thread 16 (Thread 0x7fffe4c5c700 (LWP 945481)): #6 0x00005555560f0ff8 in MgrStandby::ms_dispatch2 (this=0x7fffffffb460, m=...) at /home/runsisi/build/master/src/mgr/MgrStandby.cc:435 435 std::lock_guard l(lock); (gdb) p &lock $10 = (ceph::mutex *) 0x7fffffffd688 (gdb) thread apply 25 f 7
Thread 25 (Thread 0x7fffe0453700 (LWP 945491)): #7 0x00007fffee9cfd18 in SafeTimer::timer_thread (this=0x7fffffffd800) at /home/runsisi/build/master/src/common/Timer.cc:76 76 std::unique_lock l{lock}; (gdb) p &lock $11 = (ceph::mutex *) 0x7fffffffd688
(gdb) thread 1 [Switching to thread 1 (Thread 0x7ffff7fd1fc0 (LWP 945310))] #0 0x00007fffebf39d12 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 (gdb) bt #0 0x00007fffebf39d12 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007fffed140ab8 in take_gil () from /lib64/libpython3.6m.so.1.0 #2 0x00007fffed140bf9 in PyEval_RestoreThread () from /lib64/libpython3.6m.so.1.0 #3 0x00007fffed1d5036 in PyGILState_Ensure () from /lib64/libpython3.6m.so.1.0 #4 0x00007fffa4074a05 in ?? () from /home/runsisi/.local/lib/python3.6/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-36m-x86_64-linux-gnu.so #5 0x00007fffa4061b45 in ?? () from /home/runsisi/.local/lib/python3.6/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-36m-x86_64-linux-gnu.so #6 0x00007fffa407627a in ?? () from /home/runsisi/.local/lib/python3.6/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-36m-x86_64-linux-gnu.so #7 0x00007fffa4061f32 in ?? () from /home/runsisi/.local/lib/python3.6/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-36m-x86_64-linux-gnu.so #8 0x00007fffa4063650 in PyInit_pypocketfft () from /home/runsisi/.local/lib/python3.6/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-36m-x86_64-linux-gnu.so #9 0x00007fffed1d19b0 in _PyImport_LoadDynamicModuleWithSpec () from /lib64/libpython3.6m.so.1.0 ... 
#222 0x00007fffed165a76 in PyImport_Import () from /lib64/libpython3.6m.so.1.0 #223 0x00007fffed165bcb in PyImport_ImportModule () from /lib64/libpython3.6m.so.1.0 #224 0x000055555612089b in PyModule::load_subclass_of (this=0x555557135a50, base_class=0x55555643b6ea "MgrModule", py_class=0x555557135b08) at /home/runsisi/build/master/src/mgr/PyModule.cc:649 #225 0x000055555611d0d6 in PyModule::load (this=0x555557135a50, pMainThreadState=0x5555571d6000) at /home/runsisi/build/master/src/mgr/PyModule.cc:335 #226 0x0000555556127ef4 in PyModuleRegistry::init (this=0x7fffffffd8c8) at /home/runsisi/build/master/src/mgr/PyModuleRegistry.cc:86 #227 0x00005555560eddbd in MgrStandby::init (this=0x7fffffffb460) at /home/runsisi/build/master/src/mgr/MgrStandby.cc:184 #228 0x0000555555ebf09c in main (argc=4, argv=0x7fffffffdc88) at /home/runsisi/build/master/src/ceph_mgr.cc:71
从 1 号线程的调用栈来看,它在加载 mgr 模块(diskprediction_local)的过程中调用了 PyEval_RestoreThread 接口,并一直在等待 GIL 锁。但是,1 号线程在调用 PyModule::load 加载 mgr 模块时,已经调用过 PyEval_RestoreThread 接口并拿到了 GIL 锁。而 PyEval_RestoreThread 的接口文档明确提及,已经持有 GIL 锁的线程不能再次调用该接口,否则会发生死锁(If the lock has been created, the current thread must not have acquired it, otherwise deadlock ensues)。显然,16、25 两个线程等锁只是表象,真正的死锁发生在 1 号线程内部。
(gdb) bt #0 0x00007fffebf39d12 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007fffed140ab8 in PyCOND_TIMEDWAIT (cond=0x7fffed50c580 <gil_cond>, mut=0x7fffed50c540 <gil_mutex>, us=5000) at /usr/src/debug/Python-3.6.8/Python/condvar.h:103 #2 take_gil (tstate=tstate@entry=0x5555571d6900) at /usr/src/debug/Python-3.6.8/Python/ceval_gil.h:224 #3 0x00007fffed140bf9 in PyEval_RestoreThread (tstate=tstate@entry=0x5555571d6900) at /usr/src/debug/Python-3.6.8/Python/ceval.c:369 #4 0x00007fffed1d5036 in PyGILState_Ensure () at /usr/src/debug/Python-3.6.8/Python/pystate.c:895 #5 0x00007fffd4e8fc69 in pybind11::detail::get_internals()::gil_scoped_acquire_local::gil_scoped_acquire_local() (this=0x7fffffff7550) at /home/runsisi/build/pyleak/pybind11/include/pybind11/detail/internals.h:253 #6 0x00007fffd4e900b8 in pybind11::detail::get_internals () at /home/runsisi/build/pyleak/pybind11/include/pybind11/detail/internals.h:256 #7 0x00007fffd4e8c852 in x::PyInit_x () at /home/runsisi/build/pyleak/src/x.cc:11 #8 0x00007fffed1d19b0 in _PyImport_LoadDynamicModuleWithSpec (spec=spec@entry=0x7fffd76eb208, fp=fp@entry=0x0) at /usr/src/debug/Python-3.6.8/Python/importdl.c:159 ... 
#106 0x00007fffed165a76 in PyImport_Import (module_name=module_name@entry=0x7fffd9463228) at /usr/src/debug/Python-3.6.8/Python/import.c:1767 #107 0x00007fffed165bcb in PyImport_ImportModule (name=<optimized out>) at /usr/src/debug/Python-3.6.8/Python/import.c:1269 #108 0x000055555612089b in PyModule::load_subclass_of (this=0x5555571355f0, base_class=0x55555643b6ea "MgrModule", py_class=0x5555571356a8) at /home/runsisi/build/master/src/mgr/PyModule.cc:649 #109 0x000055555611d0d6 in PyModule::load (this=0x5555571355f0, pMainThreadState=0x5555571d6900) at /home/runsisi/build/master/src/mgr/PyModule.cc:335 #110 0x0000555556127ef4 in PyModuleRegistry::init (this=0x7fffffffd8c8) at /home/runsisi/build/master/src/mgr/PyModuleRegistry.cc:86 #111 0x00005555560eddbd in MgrStandby::init (this=0x7fffffffb460) at /home/runsisi/build/master/src/mgr/MgrStandby.cc:184 #112 0x0000555555ebf09c in main (argc=4, argv=0x7fffffffdc88) at /home/runsisi/build/master/src/ceph_mgr.cc:71
PYBIND11_NOINLINE inline internals &get_internals(){ auto **&internals_pp = get_internals_pp(); if (internals_pp && *internals_pp) return **internals_pp;
// Ensure that the GIL is held since we will need to make Python calls.
// Cannot use py::gil_scoped_acquire here since that constructor calls get_internals.
struct gil_scoped_acquire_local {
    gil_scoped_acquire_local() : state (PyGILState_Ensure()) {}
    ~gil_scoped_acquire_local() { PyGILState_Release(state); }
    const PyGILState_STATE state;
} gil;
...
}
Python Thread State & GIL(基于 Python 3.6.8)
接着来看一下 PyGILState_Ensure 的行为:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
PyGILState_STATE PyGILState_Ensure(void) { int current; PyThreadState *tcur;
tcur = (PyThreadState *)PyThread_get_key_value(autoTLSkey); current = PyThreadState_IsCurrent(tcur);
if (current == 0) { PyEval_RestoreThread(tcur); }
return current ? PyGILState_LOCKED : PyGILState_UNLOCKED; }
1 2 3 4 5 6 7
static int PyThreadState_IsCurrent(PyThreadState *tstate) {
    /* Must be the tstate for this thread */
    assert(PyGILState_GetThisThreadState()==tstate);
    return tstate == GET_TSTATE();
}
显然,由于 current == 0,或者说 autoTLSkey TLS 中记录的 PyThreadState 实例(运行 Python 解释器的 OS 线程必须创建对应的 PyThreadState 实例)与当前全局 PyThreadState 实例 _PyThreadState_Current(通过 GET_TSTATE / SET_TSTATE 访问)不一致,导致 PyEval_RestoreThread 被调用。或者说,PyGILState_* 这些 API 基于一个简单的假设,如果当前 OS 线程 TLS 记录的 tstate(即 PyThreadState 实例)与全局 tstate 不一致,则表示当前 OS 线程没有拿到 GIL 锁,因此,PyGILState_Ensure 就会调用 PyEval_RestoreThread 去拿 GIL 锁。
线程 TLS 记录的 tstate 为何会出现与 _PyThreadState_Current 全局变量记录的 tstate 不一致?确实,一种情况是因为该线程确实没有拿 GIL 锁,但还有一种情况就是我们这里分析的:在同一 OS 线程里使用多个 Python 解释器。
// create gil and let the current tstate, i.e., the main tstate takes the gil
PyEval_InitThreads();
pMainThreadState = PyEval_SaveThread();
// load each mgr module in a separate newly created sub-interpreter
for (...) {
    {
        SafeThreadState sts(pMainThreadState);
        Gil gil(sts); // main tstate (associated with the main interpreter) takes gil

        // create sub-interpreter and returns the sub-tstate associated with the sub-interpreter
        auto thread_state = Py_NewInterpreter();
        pMyThreadState.set(thread_state);
    }
    {
        Gil gil(pMyThreadState); // sub-tstate (associated with the sub-interpreter) takes gil
        PyImport_ImportModule(...) // imported mgr modules are associated with the current tstate, i.e., the sub-tstate
    }
}
其中 Gil 只是一个使用 C++ RAII 特性实现的 GIL 管理类:
1 2 3 4 5 6 7 8 9 10 11
Gil::Gil(SafeThreadState &ts, bool new_thread) : pThreadState(ts) {
    // Acquire the GIL, set the current thread state
    PyEval_RestoreThread(pThreadState.ts);
}

Gil::~Gil() {
    // Release the GIL, reset the thread state to NULL
    PyEval_SaveThread();
}
Also note that combining this functionality with PyGILState_*() APIs is delicate, because these APIs assume a bijection between Python thread states and OS-level threads, an assumption broken by the presence of sub-interpreters. It is highly recommended that you don’t switch sub-interpreters between a pair of matching PyGILState_Ensure() and PyGILState_Release() calls. Furthermore, extensions (such as ctypes) using these APIs to allow calling of Python code from non-Python created threads will probably be broken when using sub-interpreters.