diff --git a/bin/cythonize.py b/bin/cythonize.py index 2c18ae8db..ca039d81a 100755 --- a/bin/cythonize.py +++ b/bin/cythonize.py @@ -1,62 +1,50 @@ #!/usr/bin/env python -""" cythonize +""" cythonize.py -Cythonize pyx files into C files as needed. +Cythonize pyx files into C++ files as needed. -Usage: cythonize [root_dir] - -Default [root_dir] is 'spacy'. +Usage: cythonize.py [root] Checks pyx files to see if they have been changed relative to their -corresponding C files. If they have, then runs cython on these files to -recreate the C files. +corresponding C++ files. If they have, then runs cython on these files to +recreate the C++ files. -The script thinks that the pyx files have changed relative to the C files -by comparing hashes stored in a database file. +Additionally, checks pxd files and setup.py if they have been changed. If +they have, rebuilds everything. -Simple script to invoke Cython (and Tempita) on all .pyx (.pyx.in) -files; while waiting for a proper build system. Uses file hashes to -figure out if rebuild is needed. +Change detection based on file hashes stored in JSON format. For now, this script should be run by developers when changing Cython files -only, and the resulting C files checked in, so that end-users (and Python-only -developers) do not get the Cython/Tempita dependencies. +and the resulting C++ files checked in, so that end-users (and Python-only +developers) do not get the Cython dependencies. -Originally written by Dag Sverre Seljebotn, and copied here from: +Based upon: https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py +https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py -Note: this script does not check any of the dependent C libraries; it only -operates on the Cython .pyx files. +Note: this script does not check any of the dependent C++ libraries. """ - -from __future__ import division, print_function, absolute_import +from __future__ import print_function import os -import re import sys +import json import hashlib import subprocess +import argparse -HASH_FILE = 'cythonize.dat' -DEFAULT_ROOT = 'spacy' -VENDOR = 'spaCy' -# WindowsError is not defined on unix systems -try: - WindowsError -except NameError: - WindowsError = None +HASH_FILE = 'cythonize.json' + -# -# Rules -# def process_pyx(fromfile, tofile): + print('Processing %s' % fromfile) try: from Cython.Compiler.Version import version as cython_version from distutils.version import LooseVersion if LooseVersion(cython_version) < LooseVersion('0.19'): - raise Exception('Building %s requires Cython >= 0.19' % VENDOR) + raise Exception('Require Cython >= 0.19') except ImportError: pass @@ -67,133 +55,101 @@ def process_pyx(fromfile, tofile): try: try: - r = subprocess.call(['cython'] + flags + ["-o", tofile, fromfile]) + r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile]) if r != 0: raise Exception('Cython failed') except OSError: # There are ways of installing Cython that don't result in a cython # executable on the path, see gh-2397. r = subprocess.call([sys.executable, '-c', - 'import sys; from Cython.Compiler.Main import ' - 'setuptools_main as main; sys.exit(main())'] + flags + - ["-o", tofile, fromfile]) + 'import sys; from Cython.Compiler.Main import ' + 'setuptools_main as main; sys.exit(main())'] + flags + + ['-o', tofile, fromfile]) if r != 0: raise Exception('Cython failed') except OSError: raise OSError('Cython needs to be installed') -def process_tempita_pyx(fromfile, tofile): - try: - try: - from Cython import Tempita as tempita - except ImportError: - import tempita - except ImportError: - raise Exception('Building %s requires Tempita: ' - 'pip install --user Tempita' % VENDOR) - with open(fromfile, "r") as f: - tmpl = f.read() - pyxcontent = tempita.sub(tmpl) - assert fromfile.endswith('.pyx.in') - pyxfile = fromfile[:-len('.pyx.in')] + '.pyx' - with open(pyxfile, "w") as f: - f.write(pyxcontent) - process_pyx(pyxfile, tofile) - -rules = { - # fromext : function - '.pyx' : process_pyx, - '.pyx.in' : process_tempita_pyx - } -# -# Hash db -# -def load_hashes(filename): - # Return { filename : (sha1 of input, sha1 of output) } - if os.path.isfile(filename): - hashes = {} - with open(filename, 'r') as f: - for line in f: - filename, inhash, outhash = line.split() - hashes[filename] = (inhash, outhash) - else: - hashes = {} - return hashes - -def save_hashes(hash_db, filename): - with open(filename, 'w') as f: - for key, value in sorted(hash_db.items()): - f.write("%s %s %s\n" % (key, value[0], value[1])) - -def sha1_of_file(filename): - h = hashlib.sha1() - with open(filename, "rb") as f: - h.update(f.read()) - return h.hexdigest() - -# -# Main program -# - -def normpath(path): - path = path.replace(os.sep, '/') - if path.startswith('./'): - path = path[2:] - return path - -def get_hash(frompath, topath): - from_hash = sha1_of_file(frompath) - to_hash = sha1_of_file(topath) if os.path.exists(topath) else None - return (from_hash, to_hash) - -def process(path, fromfile, tofile, processor_function, hash_db): - fullfrompath = os.path.join(path, fromfile) - fulltopath = os.path.join(path, tofile) - current_hash = get_hash(fullfrompath, fulltopath) - if current_hash == hash_db.get(normpath(fullfrompath), None): - print('%s has not changed' % fullfrompath) - return +def preserve_cwd(path, func, *args): orig_cwd = os.getcwd() try: os.chdir(path) - print('Processing %s' % fullfrompath) - processor_function(fromfile, tofile) + func(*args) finally: os.chdir(orig_cwd) - # changed target file, recompute hash - current_hash = get_hash(fullfrompath, fulltopath) - # store hash in db - hash_db[normpath(fullfrompath)] = current_hash -def find_process_files(root_dir): - hash_db = load_hashes(HASH_FILE) - for cur_dir, dirs, files in os.walk(root_dir): - for filename in files: - in_file = os.path.join(cur_dir, filename + ".in") - if filename.endswith('.pyx') and os.path.isfile(in_file): - continue - for fromext, function in rules.items(): - if filename.endswith(fromext): - toext = ".cpp" - # with open(os.path.join(cur_dir, filename), 'rb') as f: - # data = f.read() - # m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M) - # if m: - # toext = ".cxx" - fromfile = filename - tofile = filename[:-len(fromext)] + toext - process(cur_dir, fromfile, tofile, function, hash_db) - save_hashes(hash_db, HASH_FILE) - -def main(): +def load_hashes(filename): try: - root_dir = sys.argv[1] - except IndexError: - root_dir = DEFAULT_ROOT - find_process_files(root_dir) + return json.load(open(filename)) + except (ValueError, IOError): + return {} + + +def save_hashes(hash_db, filename): + json.dump(hash_db, open(filename, 'w')) + + +def get_hash(path): + return hashlib.md5(open(path).read()).hexdigest() + + +def hash_changed(base, path, db): + full_path = os.path.normpath(os.path.join(base, path)) + return not get_hash(full_path) == db.get(full_path) + + +def hash_add(base, path, db): + full_path = os.path.normpath(os.path.join(base, path)) + db[full_path] = get_hash(full_path) + + +def process(base, filename, db): + root, ext = os.path.splitext(filename) + if ext in ['.pyx', '.cpp']: + if hash_changed(base, filename, db): + preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp') + hash_add(base, root + '.cpp', db) + hash_add(base, root + '.pyx', db) + + +def check_changes(root, db): + res = False + new_db = {} + + setup_filename = 'setup.py' + hash_add('.', setup_filename, new_db) + if hash_changed('.', setup_filename, db): + res = True + + for base, _, files in os.walk(root): + for filename in files: + if filename.endswith('.pxd'): + hash_add(base, filename, new_db) + if hash_changed(base, filename, db): + res = True + + if res: + db.clear() + db.update(new_db) + return res + + +def run(root): + db = load_hashes(HASH_FILE) + + try: + check_changes(root, db) + for base, _, files in os.walk(root): + for filename in files: + process(base, filename, db) + finally: + save_hashes(db, HASH_FILE) if __name__ == '__main__': - main() \ No newline at end of file + parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed') + parser.add_argument('root', help='root directory') + args = parser.parse_args() + run(args.root)