ansible: better support for diagnosing hangs

* Always enable the faulthandler module in the top-level process if it
  is available.
* Make MITOGEN_DUMP_THREAD_STACKS interval configurable, to better
  handle larger runs.
* Add docs subsection on diagnosing hangs.

Conflicts:
	ansible_mitogen/process.py
This commit is contained in:
David Wilson 2018-08-24 16:10:46 +01:00
parent e52684c186
commit 8ab11f415f
3 changed files with 69 additions and 24 deletions

View File

@ -36,6 +36,11 @@ import socket
import sys import sys
import time import time
try:
import faulthandler
except ImportError:
faulthandler = None
import mitogen import mitogen
import mitogen.core import mitogen.core
import mitogen.debug import mitogen.debug
@ -70,6 +75,17 @@ def clean_shutdown(sock):
sock.recv(1) sock.recv(1)
def getenv_int(key, default=0):
"""
Get an integer-valued environment variable `key`, if it exists and parses
as an integer, otherwise return `default`.
"""
try:
return int(os.environ.get(key, str(default)))
except ValueError:
return default
class MuxProcess(object): class MuxProcess(object):
""" """
Implement a subprocess forked from the Ansible top-level, as a safe place Implement a subprocess forked from the Ansible top-level, as a safe place
@ -127,6 +143,9 @@ class MuxProcess(object):
if cls.worker_sock is not None: if cls.worker_sock is not None:
return return
if faulthandler is not None:
faulthandler.enable()
cls.unix_listener_path = mitogen.unix.make_socket_path() cls.unix_listener_path = mitogen.unix.make_socket_path()
cls.worker_sock, cls.child_sock = socket.socketpair() cls.worker_sock, cls.child_sock = socket.socketpair()
atexit.register(lambda: clean_shutdown(cls.worker_sock)) atexit.register(lambda: clean_shutdown(cls.worker_sock))
@ -164,6 +183,15 @@ class MuxProcess(object):
# Block until the socket is closed, which happens on parent exit. # Block until the socket is closed, which happens on parent exit.
mitogen.core.io_op(self.child_sock.recv, 1) mitogen.core.io_op(self.child_sock.recv, 1)
def _enable_router_debug(self):
if 'MITOGEN_ROUTER_DEBUG' in os.environ:
self.router.enable_debug()
def _enable_stack_dumps(self):
secs = getenv_int('MITOGEN_DUMP_THREAD_STACKS', default=0)
if secs:
mitogen.debug.dump_to_logger(secs=secs)
def _setup_master(self): def _setup_master(self):
""" """
Construct a Router, Broker, and mitogen.unix listener Construct a Router, Broker, and mitogen.unix listener
@ -177,10 +205,8 @@ class MuxProcess(object):
router=self.router, router=self.router,
path=self.unix_listener_path, path=self.unix_listener_path,
) )
if 'MITOGEN_ROUTER_DEBUG' in os.environ: self._enable_router_debug()
self.router.enable_debug() self._enable_stack_dumps()
if 'MITOGEN_DUMP_THREAD_STACKS' in os.environ:
mitogen.debug.dump_to_logger()
def _setup_services(self): def _setup_services(self):
""" """
@ -195,7 +221,7 @@ class MuxProcess(object):
ansible_mitogen.services.ContextService(self.router), ansible_mitogen.services.ContextService(self.router),
ansible_mitogen.services.ModuleDepService(self.router), ansible_mitogen.services.ModuleDepService(self.router),
], ],
size=int(os.environ.get('MITOGEN_POOL_SIZE', '16')), size=getenv_int('MITOGEN_POOL_SIZE', default=16),
) )
LOG.debug('Service pool configured: size=%d', self.pool.size) LOG.debug('Service pool configured: size=%d', self.pool.size)

View File

@ -832,25 +832,43 @@ except connection delegation is supported.
Debugging Debugging
--------- ---------
Diagnostics and use of the :py:mod:`logging` package output on the target Diagnostics and :py:mod:`logging` package output on targets are usually
machine are usually discarded. With Mitogen, all of this is captured and discarded. With Mitogen, these are captured and forwarded to the controller
returned to the controller, where it can be viewed as desired with ``-vvv``. where they can be viewed with ``-vvv``. Basic high level logs are produced with
Basic high level logs are produced with ``-vvv``, with logging of all IO on the ``-vvv``, with logging of all IO on the controller with ``-vvvv`` or higher.
controller with ``-vvvv`` or higher.
Although use of standard IO and the logging package on the target is forwarded While uncaptured standard IO and the logging package on targets is forwarded,
to the controller, it is not possible to receive IO activity logs, as the it is not possible to receive IO activity logs, as the forwarding process would
process of receiving those logs would would itself generate IO activity. To would itself generate additional IO.
receive a complete trace of every process on every machine, file-based logging
is necessary. File-based logging can be enabled by setting
``MITOGEN_ROUTER_DEBUG=1`` in your environment.
When file-based logging is enabled, one file per context will be created on the To receive a complete trace of every process on every machine, file-based
local machine and every target machine, as ``/tmp/mitogen.<pid>.log``. logging is necessary. File-based logging can be enabled by setting
``MITOGEN_ROUTER_DEBUG=1`` in your environment. When file-based logging is
enabled, one file per context will be created on the local machine and every
target machine, as ``/tmp/mitogen.<pid>.log``.
If you are experiencing a hang, ``MITOGEN_DUMP_THREAD_STACKS=1`` causes every Diagnosing Hangs
process on every machine to dump every thread stack into the logging framework ~~~~~~~~~~~~~~~~
every 5 seconds.
If you encounter a hang, the ``MITOGEN_DUMP_THREAD_STACKS=<secs>`` environment
variable arranges for each process on each machine to dump each thread stack
into the logging framework every `secs` seconds, which is visible when running
with ``-vvv``.
However, certain controller hangs may render ``MITOGEN_DUMP_THREAD_STACKS``
ineffective, or occur too infrequently for interactive reproduction. In these
cases `faulthandler <https://faulthandler.readthedocs.io/>`_ may be used:
1. For Python 2, ``pip install faulthandler``. This is unnecessary on Python 3.
2. Once the hang occurs, observe the process tree using ``pstree`` or ``ps
--forest``.
3. The most likely process to be hung is the connection multiplexer, which can
easily be identified as the parent of all SSH client processes.
4. Send ``kill -SEGV <pid>`` to the multiplexer PID, causing it to print all
thread stacks.
5. `File a bug <https://github.com/dw/mitogen/issues/new/>`_ including a copy
of the stacks, along with a description of the last task executing prior to
the hang.
Getting Help Getting Help

View File

@ -183,15 +183,16 @@ def install_handler():
signal.signal(signal.SIGUSR2, _handler) signal.signal(signal.SIGUSR2, _handler)
def _logging_main(): def _logging_main(secs):
while True: while True:
time.sleep(5) time.sleep(secs)
LOG.info('PERIODIC THREAD DUMP\n\n%s', get_snapshot()) LOG.info('PERIODIC THREAD DUMP\n\n%s', get_snapshot())
def dump_to_logger(): def dump_to_logger(secs=5):
th = threading.Thread( th = threading.Thread(
target=_logging_main, target=_logging_main,
kwargs={'secs': secs},
name='mitogen.debug.dump_to_logger', name='mitogen.debug.dump_to_logger',
) )
th.setDaemon(True) th.setDaemon(True)