ansible: better support for diagnosing hangs
* Always enable the faulthandler module in the top-level process if it is available. * Make MITOGEN_DUMP_THREAD_STACKS interval configurable, to better handle larger runs. * Add docs subsection on diagnosing hangs. Conflicts: ansible_mitogen/process.py
This commit is contained in:
parent
e52684c186
commit
8ab11f415f
|
@ -36,6 +36,11 @@ import socket
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
try:
|
||||||
|
import faulthandler
|
||||||
|
except ImportError:
|
||||||
|
faulthandler = None
|
||||||
|
|
||||||
import mitogen
|
import mitogen
|
||||||
import mitogen.core
|
import mitogen.core
|
||||||
import mitogen.debug
|
import mitogen.debug
|
||||||
|
@ -70,6 +75,17 @@ def clean_shutdown(sock):
|
||||||
sock.recv(1)
|
sock.recv(1)
|
||||||
|
|
||||||
|
|
||||||
|
def getenv_int(key, default=0):
|
||||||
|
"""
|
||||||
|
Get an integer-valued environment variable `key`, if it exists and parses
|
||||||
|
as an integer, otherwise return `default`.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return int(os.environ.get(key, str(default)))
|
||||||
|
except ValueError:
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
class MuxProcess(object):
|
class MuxProcess(object):
|
||||||
"""
|
"""
|
||||||
Implement a subprocess forked from the Ansible top-level, as a safe place
|
Implement a subprocess forked from the Ansible top-level, as a safe place
|
||||||
|
@ -127,6 +143,9 @@ class MuxProcess(object):
|
||||||
if cls.worker_sock is not None:
|
if cls.worker_sock is not None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if faulthandler is not None:
|
||||||
|
faulthandler.enable()
|
||||||
|
|
||||||
cls.unix_listener_path = mitogen.unix.make_socket_path()
|
cls.unix_listener_path = mitogen.unix.make_socket_path()
|
||||||
cls.worker_sock, cls.child_sock = socket.socketpair()
|
cls.worker_sock, cls.child_sock = socket.socketpair()
|
||||||
atexit.register(lambda: clean_shutdown(cls.worker_sock))
|
atexit.register(lambda: clean_shutdown(cls.worker_sock))
|
||||||
|
@ -164,6 +183,15 @@ class MuxProcess(object):
|
||||||
# Block until the socket is closed, which happens on parent exit.
|
# Block until the socket is closed, which happens on parent exit.
|
||||||
mitogen.core.io_op(self.child_sock.recv, 1)
|
mitogen.core.io_op(self.child_sock.recv, 1)
|
||||||
|
|
||||||
|
def _enable_router_debug(self):
|
||||||
|
if 'MITOGEN_ROUTER_DEBUG' in os.environ:
|
||||||
|
self.router.enable_debug()
|
||||||
|
|
||||||
|
def _enable_stack_dumps(self):
|
||||||
|
secs = getenv_int('MITOGEN_DUMP_THREAD_STACKS', default=0)
|
||||||
|
if secs:
|
||||||
|
mitogen.debug.dump_to_logger(secs=secs)
|
||||||
|
|
||||||
def _setup_master(self):
|
def _setup_master(self):
|
||||||
"""
|
"""
|
||||||
Construct a Router, Broker, and mitogen.unix listener
|
Construct a Router, Broker, and mitogen.unix listener
|
||||||
|
@ -177,10 +205,8 @@ class MuxProcess(object):
|
||||||
router=self.router,
|
router=self.router,
|
||||||
path=self.unix_listener_path,
|
path=self.unix_listener_path,
|
||||||
)
|
)
|
||||||
if 'MITOGEN_ROUTER_DEBUG' in os.environ:
|
self._enable_router_debug()
|
||||||
self.router.enable_debug()
|
self._enable_stack_dumps()
|
||||||
if 'MITOGEN_DUMP_THREAD_STACKS' in os.environ:
|
|
||||||
mitogen.debug.dump_to_logger()
|
|
||||||
|
|
||||||
def _setup_services(self):
|
def _setup_services(self):
|
||||||
"""
|
"""
|
||||||
|
@ -195,7 +221,7 @@ class MuxProcess(object):
|
||||||
ansible_mitogen.services.ContextService(self.router),
|
ansible_mitogen.services.ContextService(self.router),
|
||||||
ansible_mitogen.services.ModuleDepService(self.router),
|
ansible_mitogen.services.ModuleDepService(self.router),
|
||||||
],
|
],
|
||||||
size=int(os.environ.get('MITOGEN_POOL_SIZE', '16')),
|
size=getenv_int('MITOGEN_POOL_SIZE', default=16),
|
||||||
)
|
)
|
||||||
LOG.debug('Service pool configured: size=%d', self.pool.size)
|
LOG.debug('Service pool configured: size=%d', self.pool.size)
|
||||||
|
|
||||||
|
|
|
@ -832,25 +832,43 @@ except connection delegation is supported.
|
||||||
Debugging
|
Debugging
|
||||||
---------
|
---------
|
||||||
|
|
||||||
Diagnostics and use of the :py:mod:`logging` package output on the target
|
Diagnostics and :py:mod:`logging` package output on targets are usually
|
||||||
machine are usually discarded. With Mitogen, all of this is captured and
|
discarded. With Mitogen, these are captured and forwarded to the controller
|
||||||
returned to the controller, where it can be viewed as desired with ``-vvv``.
|
where they can be viewed with ``-vvv``. Basic high level logs are produced with
|
||||||
Basic high level logs are produced with ``-vvv``, with logging of all IO on the
|
``-vvv``, with logging of all IO on the controller with ``-vvvv`` or higher.
|
||||||
controller with ``-vvvv`` or higher.
|
|
||||||
|
|
||||||
Although use of standard IO and the logging package on the target is forwarded
|
While uncaptured standard IO and the logging package on targets is forwarded,
|
||||||
to the controller, it is not possible to receive IO activity logs, as the
|
it is not possible to receive IO activity logs, as the forwarding process would
|
||||||
process of receiving those logs would would itself generate IO activity. To
|
would itself generate additional IO.
|
||||||
receive a complete trace of every process on every machine, file-based logging
|
|
||||||
is necessary. File-based logging can be enabled by setting
|
|
||||||
``MITOGEN_ROUTER_DEBUG=1`` in your environment.
|
|
||||||
|
|
||||||
When file-based logging is enabled, one file per context will be created on the
|
To receive a complete trace of every process on every machine, file-based
|
||||||
local machine and every target machine, as ``/tmp/mitogen.<pid>.log``.
|
logging is necessary. File-based logging can be enabled by setting
|
||||||
|
``MITOGEN_ROUTER_DEBUG=1`` in your environment. When file-based logging is
|
||||||
|
enabled, one file per context will be created on the local machine and every
|
||||||
|
target machine, as ``/tmp/mitogen.<pid>.log``.
|
||||||
|
|
||||||
If you are experiencing a hang, ``MITOGEN_DUMP_THREAD_STACKS=1`` causes every
|
Diagnosing Hangs
|
||||||
process on every machine to dump every thread stack into the logging framework
|
~~~~~~~~~~~~~~~~
|
||||||
every 5 seconds.
|
|
||||||
|
If you encounter a hang, the ``MITOGEN_DUMP_THREAD_STACKS=<secs>`` environment
|
||||||
|
variable arranges for each process on each machine to dump each thread stack
|
||||||
|
into the logging framework every `secs` seconds, which is visible when running
|
||||||
|
with ``-vvv``.
|
||||||
|
|
||||||
|
However, certain controller hangs may render ``MITOGEN_DUMP_THREAD_STACKS``
|
||||||
|
ineffective, or occur too infrequently for interactive reproduction. In these
|
||||||
|
cases `faulthandler <https://faulthandler.readthedocs.io/>`_ may be used:
|
||||||
|
|
||||||
|
1. For Python 2, ``pip install faulthandler``. This is unnecessary on Python 3.
|
||||||
|
2. Once the hang occurs, observe the process tree using ``pstree`` or ``ps
|
||||||
|
--forest``.
|
||||||
|
3. The most likely process to be hung is the connection multiplexer, which can
|
||||||
|
easily be identified as the parent of all SSH client processes.
|
||||||
|
4. Send ``kill -SEGV <pid>`` to the multiplexer PID, causing it to print all
|
||||||
|
thread stacks.
|
||||||
|
5. `File a bug <https://github.com/dw/mitogen/issues/new/>`_ including a copy
|
||||||
|
of the stacks, along with a description of the last task executing prior to
|
||||||
|
the hang.
|
||||||
|
|
||||||
|
|
||||||
Getting Help
|
Getting Help
|
||||||
|
|
|
@ -183,15 +183,16 @@ def install_handler():
|
||||||
signal.signal(signal.SIGUSR2, _handler)
|
signal.signal(signal.SIGUSR2, _handler)
|
||||||
|
|
||||||
|
|
||||||
def _logging_main():
|
def _logging_main(secs):
|
||||||
while True:
|
while True:
|
||||||
time.sleep(5)
|
time.sleep(secs)
|
||||||
LOG.info('PERIODIC THREAD DUMP\n\n%s', get_snapshot())
|
LOG.info('PERIODIC THREAD DUMP\n\n%s', get_snapshot())
|
||||||
|
|
||||||
|
|
||||||
def dump_to_logger():
|
def dump_to_logger(secs=5):
|
||||||
th = threading.Thread(
|
th = threading.Thread(
|
||||||
target=_logging_main,
|
target=_logging_main,
|
||||||
|
kwargs={'secs': secs},
|
||||||
name='mitogen.debug.dump_to_logger',
|
name='mitogen.debug.dump_to_logger',
|
||||||
)
|
)
|
||||||
th.setDaemon(True)
|
th.setDaemon(True)
|
||||||
|
|
Loading…
Reference in New Issue