ansible: better support for diagnosing hangs
* Always enable the faulthandler module in the top-level process if it is available. * Make MITOGEN_DUMP_THREAD_STACKS interval configurable, to better handle larger runs. * Add docs subsection on diagnosing hangs. Conflicts: ansible_mitogen/process.py
This commit is contained in:
parent
e52684c186
commit
8ab11f415f
|
@ -36,6 +36,11 @@ import socket
|
|||
import sys
|
||||
import time
|
||||
|
||||
try:
|
||||
import faulthandler
|
||||
except ImportError:
|
||||
faulthandler = None
|
||||
|
||||
import mitogen
|
||||
import mitogen.core
|
||||
import mitogen.debug
|
||||
|
@ -70,6 +75,17 @@ def clean_shutdown(sock):
|
|||
sock.recv(1)
|
||||
|
||||
|
||||
def getenv_int(key, default=0):
|
||||
"""
|
||||
Get an integer-valued environment variable `key`, if it exists and parses
|
||||
as an integer, otherwise return `default`.
|
||||
"""
|
||||
try:
|
||||
return int(os.environ.get(key, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
class MuxProcess(object):
|
||||
"""
|
||||
Implement a subprocess forked from the Ansible top-level, as a safe place
|
||||
|
@ -127,6 +143,9 @@ class MuxProcess(object):
|
|||
if cls.worker_sock is not None:
|
||||
return
|
||||
|
||||
if faulthandler is not None:
|
||||
faulthandler.enable()
|
||||
|
||||
cls.unix_listener_path = mitogen.unix.make_socket_path()
|
||||
cls.worker_sock, cls.child_sock = socket.socketpair()
|
||||
atexit.register(lambda: clean_shutdown(cls.worker_sock))
|
||||
|
@ -164,6 +183,15 @@ class MuxProcess(object):
|
|||
# Block until the socket is closed, which happens on parent exit.
|
||||
mitogen.core.io_op(self.child_sock.recv, 1)
|
||||
|
||||
def _enable_router_debug(self):
|
||||
if 'MITOGEN_ROUTER_DEBUG' in os.environ:
|
||||
self.router.enable_debug()
|
||||
|
||||
def _enable_stack_dumps(self):
|
||||
secs = getenv_int('MITOGEN_DUMP_THREAD_STACKS', default=0)
|
||||
if secs:
|
||||
mitogen.debug.dump_to_logger(secs=secs)
|
||||
|
||||
def _setup_master(self):
|
||||
"""
|
||||
Construct a Router, Broker, and mitogen.unix listener
|
||||
|
@ -177,10 +205,8 @@ class MuxProcess(object):
|
|||
router=self.router,
|
||||
path=self.unix_listener_path,
|
||||
)
|
||||
if 'MITOGEN_ROUTER_DEBUG' in os.environ:
|
||||
self.router.enable_debug()
|
||||
if 'MITOGEN_DUMP_THREAD_STACKS' in os.environ:
|
||||
mitogen.debug.dump_to_logger()
|
||||
self._enable_router_debug()
|
||||
self._enable_stack_dumps()
|
||||
|
||||
def _setup_services(self):
|
||||
"""
|
||||
|
@ -195,7 +221,7 @@ class MuxProcess(object):
|
|||
ansible_mitogen.services.ContextService(self.router),
|
||||
ansible_mitogen.services.ModuleDepService(self.router),
|
||||
],
|
||||
size=int(os.environ.get('MITOGEN_POOL_SIZE', '16')),
|
||||
size=getenv_int('MITOGEN_POOL_SIZE', default=16),
|
||||
)
|
||||
LOG.debug('Service pool configured: size=%d', self.pool.size)
|
||||
|
||||
|
|
|
@ -832,25 +832,43 @@ except connection delegation is supported.
|
|||
Debugging
|
||||
---------
|
||||
|
||||
Diagnostics and use of the :py:mod:`logging` package output on the target
|
||||
machine are usually discarded. With Mitogen, all of this is captured and
|
||||
returned to the controller, where it can be viewed as desired with ``-vvv``.
|
||||
Basic high level logs are produced with ``-vvv``, with logging of all IO on the
|
||||
controller with ``-vvvv`` or higher.
|
||||
Diagnostics and :py:mod:`logging` package output on targets are usually
|
||||
discarded. With Mitogen, these are captured and forwarded to the controller
|
||||
where they can be viewed with ``-vvv``. Basic high level logs are produced with
|
||||
``-vvv``, with logging of all IO on the controller with ``-vvvv`` or higher.
|
||||
|
||||
Although use of standard IO and the logging package on the target is forwarded
|
||||
to the controller, it is not possible to receive IO activity logs, as the
|
||||
process of receiving those logs would would itself generate IO activity. To
|
||||
receive a complete trace of every process on every machine, file-based logging
|
||||
is necessary. File-based logging can be enabled by setting
|
||||
``MITOGEN_ROUTER_DEBUG=1`` in your environment.
|
||||
While uncaptured standard IO and the logging package on targets is forwarded,
|
||||
it is not possible to receive IO activity logs, as the forwarding process would
|
||||
would itself generate additional IO.
|
||||
|
||||
When file-based logging is enabled, one file per context will be created on the
|
||||
local machine and every target machine, as ``/tmp/mitogen.<pid>.log``.
|
||||
To receive a complete trace of every process on every machine, file-based
|
||||
logging is necessary. File-based logging can be enabled by setting
|
||||
``MITOGEN_ROUTER_DEBUG=1`` in your environment. When file-based logging is
|
||||
enabled, one file per context will be created on the local machine and every
|
||||
target machine, as ``/tmp/mitogen.<pid>.log``.
|
||||
|
||||
If you are experiencing a hang, ``MITOGEN_DUMP_THREAD_STACKS=1`` causes every
|
||||
process on every machine to dump every thread stack into the logging framework
|
||||
every 5 seconds.
|
||||
Diagnosing Hangs
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
If you encounter a hang, the ``MITOGEN_DUMP_THREAD_STACKS=<secs>`` environment
|
||||
variable arranges for each process on each machine to dump each thread stack
|
||||
into the logging framework every `secs` seconds, which is visible when running
|
||||
with ``-vvv``.
|
||||
|
||||
However, certain controller hangs may render ``MITOGEN_DUMP_THREAD_STACKS``
|
||||
ineffective, or occur too infrequently for interactive reproduction. In these
|
||||
cases `faulthandler <https://faulthandler.readthedocs.io/>`_ may be used:
|
||||
|
||||
1. For Python 2, ``pip install faulthandler``. This is unnecessary on Python 3.
|
||||
2. Once the hang occurs, observe the process tree using ``pstree`` or ``ps
|
||||
--forest``.
|
||||
3. The most likely process to be hung is the connection multiplexer, which can
|
||||
easily be identified as the parent of all SSH client processes.
|
||||
4. Send ``kill -SEGV <pid>`` to the multiplexer PID, causing it to print all
|
||||
thread stacks.
|
||||
5. `File a bug <https://github.com/dw/mitogen/issues/new/>`_ including a copy
|
||||
of the stacks, along with a description of the last task executing prior to
|
||||
the hang.
|
||||
|
||||
|
||||
Getting Help
|
||||
|
|
|
@ -183,15 +183,16 @@ def install_handler():
|
|||
signal.signal(signal.SIGUSR2, _handler)
|
||||
|
||||
|
||||
def _logging_main():
|
||||
def _logging_main(secs):
|
||||
while True:
|
||||
time.sleep(5)
|
||||
time.sleep(secs)
|
||||
LOG.info('PERIODIC THREAD DUMP\n\n%s', get_snapshot())
|
||||
|
||||
|
||||
def dump_to_logger():
|
||||
def dump_to_logger(secs=5):
|
||||
th = threading.Thread(
|
||||
target=_logging_main,
|
||||
kwargs={'secs': secs},
|
||||
name='mitogen.debug.dump_to_logger',
|
||||
)
|
||||
th.setDaemon(True)
|
||||
|
|
Loading…
Reference in New Issue