ansible: new multiplexer/workers configuration

Following on from 152effc26c9a5918cb7ead7a97fe7fa7f81b6764,

* Pin mux to CPU 0
* Pin top-level CPU 1
* Pin workers sequentially to CPU 2..n

Nets 19.5% improvement on issue_140__thread_pileup.yml when targetting
64 Docker containers on the same 8 core/16 thread machine.

Before (prior to last scheme, no affinity at all):

    2294528.731458      task-clock (msec)         #    6.443 CPUs utilized
        10,429,745      context-switches          #    0.005 M/sec
         2,049,618      cpu-migrations            #    0.893 K/sec
         8,258,952      page-faults               #    0.004 M/sec
 5,532,719,253,824      cycles                    #    2.411 GHz                      (83.35%)
 3,267,471,616,230      instructions              #    0.59  insn per cycle
                                                  #    1.22  stalled cycles per insn  (83.35%)
   662,006,455,943      branches                  #  288.515 M/sec                    (83.33%)
    39,453,895,977      branch-misses             #    5.96% of all branches          (83.37%)

     356.148064576 seconds time elapsed

After:

    2226463.958975      task-clock (msec)         #    7.784 CPUs utilized
         9,831,466      context-switches          #    0.004 M/sec
           180,065      cpu-migrations            #    0.081 K/sec
         5,082,278      page-faults               #    0.002 M/sec
 5,592,548,587,259      cycles                    #    2.512 GHz                      (83.35%)
 3,135,038,855,414      instructions              #    0.56  insn per cycle
                                                  #    1.32  stalled cycles per insn  (83.32%)
   636,397,509,232      branches                  #  285.833 M/sec                    (83.30%)
    39,135,441,790      branch-misses             #    6.15% of all branches          (83.35%)

     286.036681644 seconds time elapsed
This commit is contained in:
David Wilson 2019-01-30 03:57:55 +00:00
parent 8f6e6b3940
commit c6d5aa29ba
4 changed files with 137 additions and 53 deletions

132
ansible_mitogen/affinity.py Normal file
View File

@ -0,0 +1,132 @@
# Copyright 2017, David Wilson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
# may be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import ctypes
import mmap
import multiprocessing
import os
import struct
import mitogen.parent
try:
_libc = ctypes.CDLL(None, use_errno=True)
_strerror = _libc.strerror
_strerror.restype = ctypes.c_char_p
_pthread_mutex_init = _libc.pthread_mutex_init
_pthread_mutex_lock = _libc.pthread_mutex_lock
_pthread_mutex_unlock = _libc.pthread_mutex_unlock
_sched_setaffinity = _libc.sched_setaffinity
except (OSError, AttributeError):
_libc = None
class pthread_mutex_t(ctypes.Structure):
_fields_ = [
('data', ctypes.c_uint8 * 512),
]
def init(self):
if _pthread_mutex_init(self.data, 0):
raise Exception(_strerror(ctypes.get_errno()))
def acquire(self):
if _pthread_mutex_lock(self.data):
raise Exception(_strerror(ctypes.get_errno()))
def release(self):
if _pthread_mutex_unlock(self.data):
raise Exception(_strerror(ctypes.get_errno()))
class State(ctypes.Structure):
_fields_ = [
('lock', pthread_mutex_t),
('counter', ctypes.c_uint8),
]
class Manager(object):
"""
Bind this process to a randomly selected CPU. If done prior to starting
threads, all threads will be bound to the same CPU. This call is a no-op on
systems other than Linux.
A hook is installed that causes `reset_affinity(clear=True)` to run in the
child of any process created with :func:`mitogen.parent.detach_popen`,
ensuring CPU-intensive children like SSH are not forced to share the same
core as the (otherwise potentially very busy) parent.
Threads bound to the same CPU share cache and experience the lowest
possible inter-thread roundtrip latency, for example ensuring the minimum
possible time required for :class:`mitogen.service.Pool` to interact with
:class:`mitogen.core.Broker`, as required for every message transmitted or
received.
Binding threads of a Python process to one CPU makes sense, as they are
otherwise unable to operate in parallel, and all must acquire the same lock
prior to executing.
"""
def __init__(self):
self.mem = mmap.mmap(-1, 4096)
self.state = State.from_buffer(self.mem)
self.state.lock.init()
def _set_affinity(self, mask):
mitogen.parent._preexec_hook = self.clear
s = struct.pack('L', mask)
_sched_setaffinity(os.getpid(), len(s), s)
def cpu_count(self):
return multiprocessing.cpu_count()
def clear(self):
"""
Clear any prior binding, except for reserved CPUs.
"""
self._set_affinity(0xffffffff & ~3)
def set_cpu(self, cpu):
"""
Bind to 0-based `cpu`.
"""
self._set_affinity(1 << cpu)
def assign(self):
self.state.lock.acquire()
try:
n = self.state.counter
self.state.counter += 1
finally:
self.state.lock.release()
self.set_cpu(2 + (n % (self.cpu_count() - 2)))
manager = Manager()

View File

@ -56,6 +56,7 @@ import ansible_mitogen.logging
import ansible_mitogen.services
from mitogen.core import b
import ansible_mitogen.affinity
LOG = logging.getLogger(__name__)
@ -172,11 +173,12 @@ class MuxProcess(object):
if _init_logging:
ansible_mitogen.logging.setup()
if cls.child_pid:
ansible_mitogen.affinity.manager.set_cpu(1)
cls.child_sock.close()
cls.child_sock = None
mitogen.core.io_op(cls.worker_sock.recv, 1)
else:
mitogen.utils.reset_affinity()
ansible_mitogen.affinity.manager.set_cpu(0)
cls.worker_sock.close()
cls.worker_sock = None
self = cls()

View File

@ -31,6 +31,7 @@ import os
import threading
import mitogen.core
import ansible_mitogen.affinity
import ansible_mitogen.loaders
import ansible_mitogen.mixins
import ansible_mitogen.process
@ -105,6 +106,7 @@ def wrap_worker__run(*args, **kwargs):
import signal
signal.signal(signal.SIGTERM, signal.SIG_IGN)
ansible_mitogen.affinity.manager.assign()
return mitogen.core._profile_hook('WorkerProcess',
lambda: worker__run(*args, **kwargs)
)

View File

@ -28,17 +28,9 @@
import datetime
import logging
import multiprocessing
import os
import random
import struct
import sys
try:
import ctypes
except ImportError:
ctypes = None
import mitogen
import mitogen.core
import mitogen.master
@ -53,50 +45,6 @@ if mitogen.core.PY3:
else:
iteritems = dict.iteritems
if ctypes:
try:
_libc = ctypes.CDLL(None)
_sched_setaffinity = _libc.sched_setaffinity
except (OSError, AttributeError):
_sched_setaffinity = None
def reset_affinity(clear=False):
"""
Bind this process to a randomly selected CPU. If done prior to starting
threads, all threads will be bound to the same CPU. This call is a no-op on
systems other than Linux.
:param bool clear:
If :data:`True`, clear any prior binding.
A hook is installed that causes `reset_affinity(clear=True)` to run in the
child of any process created with :func:`mitogen.parent.detach_popen`,
ensuring CPU-intensive children like SSH are not forced to share the same
core as the (otherwise potentially very busy) parent.
Threads bound to the same CPU share cache and experience the lowest
possible inter-thread roundtrip latency, for example ensuring the minimum
possible time required for :class:`mitogen.service.Pool` to interact with
:class:`mitogen.core.Broker`, as required for every message transmitted or
received.
Binding threads of a Python process to one CPU makes sense, as they are
otherwise unable to operate in parallel, and all must acquire the same lock
prior to executing.
"""
if _sched_setaffinity is None:
return
if clear:
mask = 0xffffffff
else:
mask = 1 << random.randint(0, multiprocessing.cpu_count() - 1)
s = struct.pack('L', mask)
_sched_setaffinity(os.getpid(), len(s), s)
mitogen.parent._preexec_hook = lambda: reset_affinity(clear=True)
def setup_gil():
"""