From 8ab11f415f7a2bba4b5b8a1687e76814301bf614 Mon Sep 17 00:00:00 2001 From: David Wilson Date: Fri, 24 Aug 2018 16:10:46 +0100 Subject: [PATCH] ansible: better support for diagnosing hangs * Always enable the faulthandler module in the top-level process if it is available. * Make MITOGEN_DUMP_THREAD_STACKS interval configurable, to better handle larger runs. * Add docs subsection on diagnosing hangs. Conflicts: ansible_mitogen/process.py --- ansible_mitogen/process.py | 36 +++++++++++++++++++++++---- docs/ansible.rst | 50 ++++++++++++++++++++++++++------------ mitogen/debug.py | 7 +++--- 3 files changed, 69 insertions(+), 24 deletions(-) diff --git a/ansible_mitogen/process.py b/ansible_mitogen/process.py index e7ceb2ce..c5e4c016 100644 --- a/ansible_mitogen/process.py +++ b/ansible_mitogen/process.py @@ -36,6 +36,11 @@ import socket import sys import time +try: + import faulthandler +except ImportError: + faulthandler = None + import mitogen import mitogen.core import mitogen.debug @@ -70,6 +75,17 @@ def clean_shutdown(sock): sock.recv(1) +def getenv_int(key, default=0): + """ + Get an integer-valued environment variable `key`, if it exists and parses + as an integer, otherwise return `default`. + """ + try: + return int(os.environ.get(key, str(default))) + except ValueError: + return default + + class MuxProcess(object): """ Implement a subprocess forked from the Ansible top-level, as a safe place @@ -127,6 +143,9 @@ class MuxProcess(object): if cls.worker_sock is not None: return + if faulthandler is not None: + faulthandler.enable() + cls.unix_listener_path = mitogen.unix.make_socket_path() cls.worker_sock, cls.child_sock = socket.socketpair() atexit.register(lambda: clean_shutdown(cls.worker_sock)) @@ -164,6 +183,15 @@ class MuxProcess(object): # Block until the socket is closed, which happens on parent exit. mitogen.core.io_op(self.child_sock.recv, 1) + def _enable_router_debug(self): + if 'MITOGEN_ROUTER_DEBUG' in os.environ: + self.router.enable_debug() + + def _enable_stack_dumps(self): + secs = getenv_int('MITOGEN_DUMP_THREAD_STACKS', default=0) + if secs: + mitogen.debug.dump_to_logger(secs=secs) + def _setup_master(self): """ Construct a Router, Broker, and mitogen.unix listener @@ -177,10 +205,8 @@ class MuxProcess(object): router=self.router, path=self.unix_listener_path, ) - if 'MITOGEN_ROUTER_DEBUG' in os.environ: - self.router.enable_debug() - if 'MITOGEN_DUMP_THREAD_STACKS' in os.environ: - mitogen.debug.dump_to_logger() + self._enable_router_debug() + self._enable_stack_dumps() def _setup_services(self): """ @@ -195,7 +221,7 @@ class MuxProcess(object): ansible_mitogen.services.ContextService(self.router), ansible_mitogen.services.ModuleDepService(self.router), ], - size=int(os.environ.get('MITOGEN_POOL_SIZE', '16')), + size=getenv_int('MITOGEN_POOL_SIZE', default=16), ) LOG.debug('Service pool configured: size=%d', self.pool.size) diff --git a/docs/ansible.rst b/docs/ansible.rst index 514708bd..14f6e553 100644 --- a/docs/ansible.rst +++ b/docs/ansible.rst @@ -832,25 +832,43 @@ except connection delegation is supported. Debugging --------- -Diagnostics and use of the :py:mod:`logging` package output on the target -machine are usually discarded. With Mitogen, all of this is captured and -returned to the controller, where it can be viewed as desired with ``-vvv``. -Basic high level logs are produced with ``-vvv``, with logging of all IO on the -controller with ``-vvvv`` or higher. +Diagnostics and :py:mod:`logging` package output on targets are usually +discarded. With Mitogen, these are captured and forwarded to the controller +where they can be viewed with ``-vvv``. Basic high level logs are produced with +``-vvv``, with logging of all IO on the controller with ``-vvvv`` or higher. -Although use of standard IO and the logging package on the target is forwarded -to the controller, it is not possible to receive IO activity logs, as the -process of receiving those logs would would itself generate IO activity. To -receive a complete trace of every process on every machine, file-based logging -is necessary. File-based logging can be enabled by setting -``MITOGEN_ROUTER_DEBUG=1`` in your environment. +While uncaptured standard IO and the logging package on targets is forwarded, +it is not possible to receive IO activity logs, as the forwarding process would +would itself generate additional IO. -When file-based logging is enabled, one file per context will be created on the -local machine and every target machine, as ``/tmp/mitogen..log``. +To receive a complete trace of every process on every machine, file-based +logging is necessary. File-based logging can be enabled by setting +``MITOGEN_ROUTER_DEBUG=1`` in your environment. When file-based logging is +enabled, one file per context will be created on the local machine and every +target machine, as ``/tmp/mitogen..log``. -If you are experiencing a hang, ``MITOGEN_DUMP_THREAD_STACKS=1`` causes every -process on every machine to dump every thread stack into the logging framework -every 5 seconds. +Diagnosing Hangs +~~~~~~~~~~~~~~~~ + +If you encounter a hang, the ``MITOGEN_DUMP_THREAD_STACKS=`` environment +variable arranges for each process on each machine to dump each thread stack +into the logging framework every `secs` seconds, which is visible when running +with ``-vvv``. + +However, certain controller hangs may render ``MITOGEN_DUMP_THREAD_STACKS`` +ineffective, or occur too infrequently for interactive reproduction. In these +cases `faulthandler `_ may be used: + +1. For Python 2, ``pip install faulthandler``. This is unnecessary on Python 3. +2. Once the hang occurs, observe the process tree using ``pstree`` or ``ps + --forest``. +3. The most likely process to be hung is the connection multiplexer, which can + easily be identified as the parent of all SSH client processes. +4. Send ``kill -SEGV `` to the multiplexer PID, causing it to print all + thread stacks. +5. `File a bug `_ including a copy + of the stacks, along with a description of the last task executing prior to + the hang. Getting Help diff --git a/mitogen/debug.py b/mitogen/debug.py index 64d2292d..19cf1a89 100644 --- a/mitogen/debug.py +++ b/mitogen/debug.py @@ -183,15 +183,16 @@ def install_handler(): signal.signal(signal.SIGUSR2, _handler) -def _logging_main(): +def _logging_main(secs): while True: - time.sleep(5) + time.sleep(secs) LOG.info('PERIODIC THREAD DUMP\n\n%s', get_snapshot()) -def dump_to_logger(): +def dump_to_logger(secs=5): th = threading.Thread( target=_logging_main, + kwargs={'secs': secs}, name='mitogen.debug.dump_to_logger', ) th.setDaemon(True)