issue #19: second attempt at import scanner

This version is based on the modulefinder standard library module,
pruned back just to handle modules we know have been loaded already, and
to scan module-level imports only, rather than imports occurring in
class and function scope (crappy heuristic, but assume they are lazy
imports).

The ast and compiler modules were far too slow, whereas this version can
bytecode-compile and scan all the imports for django.db.models (58
modules) in around 200ms, i.e. roughly 3.4ms per dependency; it's probably
not going to get much faster than that.
This commit is contained in:
David Wilson 2017-09-23 17:12:18 +05:30
parent 40e2e6eb4c
commit 43ccbf0459
1 changed files with 73 additions and 53 deletions

View File

@ -1,4 +1,5 @@
import commands import commands
import dis
import errno import errno
import getpass import getpass
import imp import imp
@ -20,14 +21,6 @@ import time
import types import types
import zlib import zlib
try:
import ast
except ImportError:
# ast module is not available in Python 2.4.x, instead we shall use the
# compiler module as a fallback
ast = None
import compiler
if not hasattr(pkgutil, 'find_loader'): if not hasattr(pkgutil, 'find_loader'):
# find_loader() was new in >=2.5, but the modern pkgutil.py syntax has # find_loader() was new in >=2.5, but the modern pkgutil.py syntax has
# been kept intentionally 2.3 compatible so we can reuse it. # been kept intentionally 2.3 compatible so we can reuse it.
@ -198,6 +191,42 @@ def discard_until(fd, s, deadline):
return return
def scan_code_imports(co, LOAD_CONST=dis.opname.index('LOAD_CONST'),
                      IMPORT_NAME=dis.opname.index('IMPORT_NAME')):
    """Given a code object `co`, scan its bytecode yielding any
    ``IMPORT_NAME`` and associated prior ``LOAD_CONST`` instructions
    representing an `Import` statement or `ImportFrom` statement.

    Only the instructions of `co` itself are examined; nested code objects
    stored in ``co.co_consts`` (function and class bodies) are not visited.

    The decoder understands the classic CPython layout in which an opcode
    below ``dis.HAVE_ARGUMENT`` occupies one byte and any other opcode is
    followed by a 16-bit little-endian argument. ``EXTENDED_ARG`` prefixes
    are not combined with the following instruction.

    :param co:
        Object exposing ``co_code``, ``co_consts`` and ``co_names``.
    :param LOAD_CONST:
        Numeric opcode of ``LOAD_CONST``; bound once at definition time.
    :param IMPORT_NAME:
        Numeric opcode of ``IMPORT_NAME``; bound once at definition time.

    :return:
        Generator producing `(level, modname, namelist)` tuples, where:

        * `level`: -1 for normal import, 0, for absolute import, and >0 for
          relative import.
        * `modname`: Name of module to import, or from where `namelist` names
          are imported.
        * `namelist`: for `ImportFrom`, the list of names to be imported from
          `modname`; an empty tuple for a plain `Import`.
    """
    code = co.co_code
    if isinstance(code, str):
        # Python 2 exposes co_code as a byte string; work on integers.
        code = [ord(c) for c in code]

    # Sliding window holding the two most recently decoded `(op, oparg)`
    # pairs. Every instruction must be compared against its two immediate
    # predecessors: grouping the stream into fixed chunks of three would
    # only notice an IMPORT_NAME that happens to land on a chunk boundary.
    older = newer = None

    pos = 0
    end = len(code)
    while pos < end:
        op = code[pos]
        if op < dis.HAVE_ARGUMENT:
            arg = None
            pos += 1
        else:
            if pos + 2 >= end:
                # Truncated trailing instruction: nothing more to decode.
                break
            arg = code[pos + 1] | (code[pos + 2] << 8)
            pos += 3

        if (op == IMPORT_NAME
                and older is not None
                and newer is not None
                and older[0] == newer[0] == LOAD_CONST):
            yield (
                co.co_consts[older[1]],      # level
                co.co_names[arg],            # module name
                co.co_consts[newer[1]] or (),  # fromlist, None -> ()
            )

        older, newer = newer, (op, arg)
class LogForwarder(object): class LogForwarder(object):
def __init__(self, router): def __init__(self, router):
self._router = router self._router = router
@ -239,8 +268,7 @@ class ModuleFinder(object):
#: results around. #: results around.
self._found_cache = {} self._found_cache = {}
#: Avoid repeated AST parsing, which is extremely expensive with #: Avoid repeated dependency scanning, which is expensive.
#: py:mod:`compiler` module.
self._related_cache = {} self._related_cache = {}
def __repr__(self): def __repr__(self):
@ -252,12 +280,17 @@ class ModuleFinder(object):
if imp.is_builtin(modname) != 0: if imp.is_builtin(modname) != 0:
return True return True
module = sys.modules[modname] module = sys.modules.get(modname)
if module is None: if module is None:
return False return False
# six installs crap with no __file__
modpath = getattr(module, '__file__', '')
if 'site-packages' in modpath:
return False
for dirname in self.STDLIB_DIRS: for dirname in self.STDLIB_DIRS:
if os.path.commonprefix((dirname, module.__file__)) == dirname: if os.path.commonprefix((dirname, modpath)) == dirname:
return True return True
return False return False
@ -285,6 +318,10 @@ class ModuleFinder(object):
LOG.debug('%r does not appear in sys.modules', fullname) LOG.debug('%r does not appear in sys.modules', fullname)
return return
if 'six.moves' in fullname:
# TODO: causes inspect.getsource() to explode.
return
is_pkg = hasattr(sys.modules[fullname], '__path__') is_pkg = hasattr(sys.modules[fullname], '__path__')
try: try:
source = inspect.getsource(sys.modules[fullname]) source = inspect.getsource(sys.modules[fullname])
@ -351,34 +388,10 @@ class ModuleFinder(object):
return '.'.join(bits[:-level]) + '.' return '.'.join(bits[:-level]) + '.'
def _ast_walk(self, fullname, src): def generate_parent_names(self, fullname):
for node in ast.walk(ast.parse(src)): while '.' in fullname:
if isinstance(node, ast.Import): fullname = fullname[:fullname.rindex('.')]
for alias in node.names: yield fullname
yield alias.name
elif isinstance(node, ast.ImportFrom):
prefix = self.resolve_relpath(fullname, node.level)
for alias in node.names:
yield prefix + alias.name
def _compiler_visit(self, tree):
# TODO: this is insanely slow, need to prune the tree somehow, but it's
# only for Mitogen masters on ancient Pythons anyway.
stack = [tree]
while stack:
node = stack.pop(0)
yield node
stack.extend(node.getChildNodes())
def _compiler_walk(self, fullname, src):
for node in self._compiler_visit(compiler.parse(src)):
if isinstance(node, compiler.ast.Import):
for name, _ in node.names:
yield name
elif isinstance(node, compiler.ast.From):
prefix = self.resolve_relpath(fullname, node.level)
for name, _ in node.names:
yield prefix + name
def find_related_imports(self, fullname): def find_related_imports(self, fullname):
""" """
@ -391,26 +404,33 @@ class ModuleFinder(object):
if related is not None: if related is not None:
return related return related
_, src, _ = self.get_module_source(fullname) modpath, src, _ = self.get_module_source(fullname)
if src is None: if src is None:
LOG.warning('%r: cannot find source for %r', self, fullname) LOG.warning('%r: cannot find source for %r', self, fullname)
return [] return []
prefixes = [''] maybe_names = list(self.generate_parent_names(fullname))
if '.' in fullname:
prefixes.append(fullname.rsplit('.', 1)[0] + '.')
if ast: co = compile(src, modpath, 'exec')
walker = self._ast_walk for level, modname, namelist in scan_code_imports(co):
if level == -1:
modnames = [modname, '%s.%s' % (fullname, modname)]
else: else:
walker = self._compiler_walk modnames = [self.resolve_relpath(fullname, level) + modname]
maybe_names.extend(modnames)
maybe_names.extend(
'%s.%s' % (mname, name)
for mname in modnames
for name in namelist
)
return self._related_cache.setdefault(fullname, [ return self._related_cache.setdefault(fullname, [
prefix + name name
for prefix in prefixes for name in maybe_names
for name in walker(fullname, src) if sys.modules.get(name) is not None
if sys.modules.get(prefix + name) is not None and not self.is_stdlib_name(name)
and not self.is_stdlib_name(prefix + name) and 'six.moves' not in name # TODO: crap
]) ])
def find_related(self, fullname): def find_related(self, fullname):