mirror of https://github.com/explosion/spaCy.git
new approach to dependency headers
This commit is contained in:
parent a9fc35d3bf · commit ac318b568c
MANIFEST.in
@@ -0,0 +1 @@
recursive-include include *.h
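The rest of the commit reworks setup.py around that one-line rule: instead of relying on the headers_workaround package and copying headers into sys.prefix at install time, the build vendors the numpy and murmurhash C headers into a local include/ directory, which the rule above ships with the source distribution and which the extension builds then use as an include path. A condensed sketch of that pattern, drawn from the setup.py changes below and assuming numpy and murmurhash are importable at build time (the helper name sync_headers is ours, not spaCy's):

    import os
    import shutil

    import murmurhash
    import numpy


    def sync_headers(src_path):
        # Recreate include/ and copy the dependency headers into it, so both the
        # sdist (via MANIFEST.in's recursive-include) and the extension builds see them.
        include_dir = os.path.join(src_path, 'include')
        if os.path.exists(include_dir):
            shutil.rmtree(include_dir)
        os.mkdir(include_dir)
        shutil.copytree(os.path.join(numpy.get_include(), 'numpy'),
                        os.path.join(include_dir, 'numpy'))
        shutil.copytree(
            os.path.join(os.path.dirname(murmurhash.__file__), 'headers', 'murmurhash'),
            os.path.join(include_dir, 'murmurhash'))
        return include_dir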
436  setup.py
@@ -1,231 +1,261 @@
#!/usr/bin/env python
from setuptools import setup
import shutil

import sys
from __future__ import division, print_function
import os
from os import path

from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
import shutil
import subprocess
import sys
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc

import platform
try:
    from setuptools import Extension, setup
except ImportError:
    from distutils.core import Extension, setup

PACKAGE_DATA = {
    "spacy": ["*.pxd"],
    "spacy.tokens": ["*.pxd"],
    "spacy.serialize": ["*.pxd"],
    "spacy.syntax": ["*.pxd"],
    "spacy.en": [
        "*.pxd",
        "data/wordnet/*.exc",
        "data/wordnet/index.*",
        "data/tokenizer/*",
        "data/vocab/serializer.json"
    ]
}

MAJOR = 0
MINOR = 100
MICRO = 0
ISRELEASED = False
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)


PACKAGES = [
    'spacy',
    'spacy.tokens',
    'spacy.en',
    'spacy.serialize',
    'spacy.syntax',
    'spacy.munge',
    'spacy.tests',
    'spacy.tests.matcher',
    'spacy.tests.morphology',
    'spacy.tests.munge',
    'spacy.tests.parser',
    'spacy.tests.serialize',
    'spacy.tests.spans',
    'spacy.tests.tagger',
    'spacy.tests.tokenizer',
    'spacy.tests.tokens',
    'spacy.tests.vectors',
    'spacy.tests.vocab']


MOD_NAMES = [
    'spacy.parts_of_speech',
    'spacy.strings',
    'spacy.lexeme',
    'spacy.vocab',
    'spacy.attrs',
    'spacy.morphology',
    'spacy.tagger',
    'spacy.syntax.stateclass',
    'spacy.tokenizer',
    'spacy.syntax.parser',
    'spacy.syntax.transition_system',
    'spacy.syntax.arc_eager',
    'spacy.syntax._parse_features',
    'spacy.gold',
    'spacy.orth',
    'spacy.tokens.doc',
    'spacy.tokens.span',
    'spacy.tokens.token',
    'spacy.serialize.packer',
    'spacy.serialize.huffman',
    'spacy.serialize.bits',
    'spacy.cfile',
    'spacy.matcher',
    'spacy.syntax.ner',
    'spacy.symbols']


if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[0:2] < (3, 4):
    raise RuntimeError('Python version 2.7 or >= 3.4 required.')


# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc' : ['/Ox', '/EHsc'] ,
                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] }
link_options = {'msvc' : [] ,
                'other' : [] }
compile_options = {'msvc' : ['/Ox', '/EHsc'],
                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options = {'msvc' : [],
                'other' : []}

if sys.platform.startswith('darwin'):
    compile_options['other'].append('-mmacosx-version-min=10.8')
    compile_options['other'].append('-stdlib=libc++')
    link_options['other'].append('-lc++')


class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]
        for e in self.extensions:
            e.extra_compile_args = compile_options.get(
                self.compiler.compiler_type, compile_options['other'])
        for e in self.extensions:
            e.extra_link_args = link_options.get(
                self.compiler.compiler_type, link_options['other'])

        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]

class build_ext_subclass( build_ext, build_ext_options ):
class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)



# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
# this is necessary to get it compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.

pre_patch_customize_compiler = sysconfig.customize_compiler
def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']


if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler
# Return the git revision as a string
def git_version():
    def _minimal_ext_cmd(cmd):
        # construct minimal environment
        env = {}
        for k in ['SYSTEMROOT', 'PATH']:
            v = os.environ.get(k)
            if v is not None:
                env[k] = v
        # LANGUAGE is used on win32
        env['LANGUAGE'] = 'C'
        env['LANG'] = 'C'
        env['LC_ALL'] = 'C'
        out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0]
        return out

#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
    try:
        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
        GIT_REVISION = out.strip().decode('ascii')
    except OSError:
        GIT_REVISION = 'Unknown'

    return GIT_REVISION


includes = ['.', path.join(sys.prefix, 'include')]


try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass



def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)


def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)
    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://spacy.io",
        package_data=PACKAGE_DATA,
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )


def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data=PACKAGE_DATA,
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
                          'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
        setup_requires=["headers_workaround"],
        cmdclass = {'build_ext': build_ext_subclass },
    )

    import headers_workaround

    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')


VERSION = '0.100'
def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
def get_version_info():
    # Adding the git rev number needs to be done inside write_version_py(),
    # otherwise the import of spacy.about messes up the build under Python 3.
    FULLVERSION = VERSION
    if os.path.exists('.git'):
        GIT_REVISION = git_version()
    elif os.path.exists('spacy/about.py'):
        # must be a source distribution, use existing version file
        try:
            from spacy.about import git_revision as GIT_REVISION
        except ImportError:
            raise ImportError('Unable to import git_revision. Try removing '
                              'spacy/about.py and the build directory '
                              'before building.')
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)
        GIT_REVISION = 'Unknown'

MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']
    if not ISRELEASED:
        FULLVERSION += '.dev0+' + GIT_REVISION[:7]

    return FULLVERSION, GIT_REVISION


def write_version_py(filename='spacy/about.py'):
    cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
short_version = '%(version)s'
version = '%(version)s'
full_version = '%(full_version)s'
git_revision = '%(git_revision)s'
release = %(isrelease)s
if not release:
    version = full_version
"""
    FULLVERSION, GIT_REVISION = get_version_info()

    with open(filename, 'w') as f:
        f.write(cnt % {'version': VERSION,
                       'full_version' : FULLVERSION,
                       'git_revision' : GIT_REVISION,
                       'isrelease': str(ISRELEASED)})


def generate_cython():
    cwd = os.path.abspath(os.path.dirname(__file__))
    print('Cythonizing sources')
    p = subprocess.call([sys.executable,
                         os.path.join(cwd, 'bin', 'cythonize.py'),
                         'spacy'],
                        cwd=cwd)
    if p != 0:
        raise RuntimeError('Running cythonize failed')


def clean():
    for name in MOD_NAMES:
        name = name.replace('.', '/')
        for ext in ['.so', '.html', '.cpp', '.c']:
            if os.path.exists(name + ext):
                os.unlink(name + ext)


def setup_package():
    src_path = os.path.dirname(os.path.abspath(sys.argv[0]))
    old_path = os.getcwd()
    os.chdir(src_path)
    sys.path.insert(0, src_path)

    # Rewrite the version file everytime
    write_version_py()

    include_dirs = [
        get_python_inc(plat_specific=True),
        os.path.join(src_path, 'include')]

    ext_modules = []
    for mod_name in MOD_NAMES:
        mod_path = mod_name.replace('.', '/') + '.cpp'
        ext_modules.append(
            Extension(mod_name, [mod_path],
                language='c++', include_dirs=include_dirs))

    metadata = dict(
        name='spacy',
        packages=PACKAGES,
        description='Industrial-strength NLP',
        author='Matthew Honnibal',
        author_email='matt@spacy.io',
        version=VERSION,
        url='https://spacy.io',
        license='MIT',
        ext_modules=ext_modules,
        install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
                          'thinc == 4.0.0', 'text_unidecode', 'plac', 'six',
                          'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
        cmdclass = {
            'build_ext': build_ext_subclass},
    )

    # Run build
    cwd = os.path.abspath(os.path.dirname(__file__))
    if not os.path.exists(os.path.join(cwd, 'PKG-INFO')):
        # Generate Cython sources, unless building from source release
        generate_cython()

    # sync include dirs from native dependencies
    include_dir = os.path.join(src_path, 'include')
    if os.path.exists(include_dir):
        shutil.rmtree(include_dir)
    os.mkdir(include_dir)

    import numpy
    shutil.copytree(
        os.path.join(numpy.get_include(), 'numpy'),
        os.path.join(include_dir, 'numpy'))

    import murmurhash
    shutil.copytree(
        os.path.join(os.path.dirname(murmurhash.__file__), 'headers', 'murmurhash'),
        os.path.join(include_dir, 'murmurhash'))

    try:
        setup(**metadata)
    finally:
        del sys.path[0]
        os.chdir(old_path)


if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
        clean()
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
        setup_package()
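The comment in setup.py about subclassing build_extensions captures the key build detail: the concrete compiler is only known after finalize_options, so per-compiler flags have to be applied inside build_extensions. A minimal, self-contained sketch of that pattern as it could be reused elsewhere (the class and dict names here are illustrative, not spaCy's):

    from distutils.command.build_ext import build_ext

    COMPILE_OPTIONS = {
        'msvc': ['/Ox', '/EHsc'],
        'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
    }
    LINK_OPTIONS = {'msvc': [], 'other': []}


    class build_ext_with_options(build_ext):
        def build_extensions(self):
            # self.compiler is only populated at this point, so look up the
            # flag set by its type (msvc vs. everything else) here.
            compiler = self.compiler.compiler_type
            compile_args = COMPILE_OPTIONS.get(compiler, COMPILE_OPTIONS['other'])
            link_args = LINK_OPTIONS.get(compiler, LINK_OPTIONS['other'])
            for ext in self.extensions:
                ext.extra_compile_args = compile_args
                ext.extra_link_args = link_args
            build_ext.build_extensions(self)

    # Usage: setup(..., cmdclass={'build_ext': build_ext_with_options})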
@@ -1,3 +0,0 @@
"""Feed-forward neural network, using Theano."""

146  spacy/_nn.pyx
@@ -1,146 +0,0 @@
"""Feed-forward neural network, using Theano."""

import os
import sys
import time

import numpy

import theano
import theano.tensor as T
import plac

from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir


def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed):
    # allocate symbolic variables for the data
    words = T.vector('words')
    tags = T.vector('tags')

    word_e = _init_embedding(n_words, n_word_embed)
    tag_e = _init_embedding(n_tags, n_tag_embed)
    label_e = _init_embedding(n_labels, n_label_embed)
    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
    hidden_W, hidden_b = _init_hidden_weights(28*28, n_hidden, T.tanh)
    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e, label_e]

    x = T.concatenate([
        T.flatten(word_e[word_indices], outdim=1),
        T.flatten(tag_e[tag_indices], outdim=1)])

    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            T.tanh,
            hidden_W,
            hidden_b,
            x))[0]

    guess = T.argmax(p_y_given_x)

    cost = (
        -T.log(p_y_given_x[y])
        + L1(L1_reg, maxent_W, hidden_W, word_e, tag_e)
        + L2(L2_reg, maxent_W, hidden_W, word_e, tag_e)
    )

    train_model = theano.function(
        inputs=[words, tags, y],
        outputs=guess,
        updates=[update(learning_rate, param, cost) for param in params]
    )

    evaluate_model = theano.function(
        inputs=[x, y],
        outputs=T.neq(y, T.argmax(p_y_given_x[0])),
    )
    return train_model, evaluate_model


def _init_embedding(vocab_size, n_dim):
    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
    return theano.shared(embedding).astype(theano.config.floatX)


def _init_maxent_weights(n_hidden, n_out):
    weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
    bias = numpy.zeros((10,), dtype=theano.config.floatX)
    return (
        theano.shared(name='W', borrow=True, value=weights),
        theano.shared(name='b', borrow=True, value=bias)
    )


def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    rng = numpy.random.RandomState(1234)
    weights = numpy.asarray(
        rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)
        ),
        dtype=theano.config.floatX
    )

    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return (
        theano.shared(value=weights, name='W', borrow=True),
        theano.shared(value=bias, name='b', borrow=True)
    )


def feed_layer(activation, weights, bias, input):
    return activation(T.dot(input, weights) + bias)


def L1(L1_reg, w1, w2):
    return L1_reg * (abs(w1).sum() + abs(w2).sum())


def L2(L2_reg, w1, w2):
    return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())


def update(eta, param, cost):
    return (param, param - (eta * T.grad(cost, param)))


def main(train_loc, eval_loc, model_dir):
    learning_rate = 0.01
    L1_reg = 0.00
    L2_reg = 0.0001

    print "... reading the data"
    gold_train = list(read_json_file(train_loc))
    print '... building the model'
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
                                              L1_reg, L2_reg)

    print '... training'
    for epoch in range(1, n_epochs+1):
        for raw_text, sents in gold_tuples:
            for (ids, words, tags, ner, heads, deps), _ in sents:
                tokens = nlp.tokenizer.tokens_from_list(words)
                for t in tokens:
                    guess = train_model([t.orth], [t.tag])
                    loss += guess != t.tag
                print loss
        # compute zero-one loss on validation set
        #error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
        #print('epoch %i, validation error %f %%' % (epoch, error * 100))


if __name__ == '__main__':
    plac.call(main)
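The deleted _nn.pyx above sketches a feed-forward tagger: look up word and tag embeddings, concatenate them, pass the result through a tanh hidden layer, then a softmax output layer. A minimal numpy sketch of that forward pass, for orientation only (the helper name and array shapes are ours, not spaCy's; weight matrices are assumed to have matching dimensions):

    import numpy


    def forward(word_ids, tag_ids, word_e, tag_e, hidden_W, hidden_b, out_W, out_b):
        # word_e/tag_e: embedding tables, indexed by id; the lookups are flattened
        # and concatenated into a single input vector, as in build_model() above.
        x = numpy.concatenate([word_e[word_ids].ravel(), tag_e[tag_ids].ravel()])
        hidden = numpy.tanh(x.dot(hidden_W) + hidden_b)   # tanh hidden layer
        scores = hidden.dot(out_W) + out_b                # "maxent" output layer
        exps = numpy.exp(scores - scores.max())           # numerically stable softmax
        return exps / exps.sum()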
@@ -1,13 +0,0 @@
from ._ml cimport Model
from thinc.nn cimport InputLayer


cdef class TheanoModel(Model):
    cdef InputLayer input_layer
    cdef object train_func
    cdef object predict_func
    cdef object debug

    cdef public float eta
    cdef public float mu
    cdef public float t
@@ -1,52 +0,0 @@
from thinc.api cimport Example, ExampleC
from thinc.typedefs cimport weight_t

from ._ml cimport arg_max_if_true
from ._ml cimport arg_max_if_zero

import numpy
from os import path


cdef class TheanoModel(Model):
    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
                 eta=0.001, mu=0.9, debug=None):
        if model_loc is not None and path.isdir(model_loc):
            model_loc = path.join(model_loc, 'model')

        self.eta = eta
        self.mu = mu
        self.t = 1
        initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0)
        self.input_layer = InputLayer(input_spec, initializer)
        self.train_func = train_func
        self.predict_func = predict_func
        self.debug = debug

        self.n_classes = n_classes
        self.n_feats = len(self.input_layer)
        self.model_loc = model_loc

    def predict(self, Example eg):
        self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True)
        theano_scores = self.predict_func(eg.embeddings)[0]
        cdef int i
        for i in range(self.n_classes):
            eg.c.scores[i] = theano_scores[i]
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)

    def train(self, Example eg):
        self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False)
        theano_scores, update, y, loss = self.train_func(eg.embeddings, eg.costs,
                                                         self.eta, self.mu)
        self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
        for i in range(self.n_classes):
            eg.c.scores[i] = theano_scores[i]
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
        eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
        eg.c.cost = eg.c.costs[eg.c.guess]
        eg.c.loss = loss
        self.t += 1

    def end_training(self):
        pass
@@ -30,8 +30,12 @@ def main(data_size='all', force=False):
    path = os.path.dirname(os.path.abspath(__file__))

    data_path = os.path.abspath(os.path.join(path, '..', 'data'))
    if not os.path.isdir(data_path):
        os.mkdir(data_path)

    command = sputnik.make_command(
        data_path=os.path.abspath(os.path.join(path, '..', 'data')),
        data_path=data_path,
        repository_url='https://index.spacy.io')

    if force:
@@ -1,62 +0,0 @@
# Enum of Wordnet supersenses
cimport parts_of_speech
from .typedefs cimport flags_t

cpdef enum:
    A_behavior
    A_body
    A_feeling
    A_mind
    A_motion
    A_perception
    A_quantity
    A_relation
    A_social
    A_spatial
    A_substance
    A_time
    A_weather
    N_act
    N_animal
    N_artifact
    N_attribute
    N_body
    N_cognition
    N_communication
    N_event
    N_feeling
    N_food
    N_group
    N_location
    N_motive
    N_object
    N_person
    N_phenomenon
    N_plant
    N_possession
    N_process
    N_quantity
    N_relation
    N_shape
    N_state
    N_substance
    N_time
    V_body
    V_change
    V_cognition
    V_communication
    V_competition
    V_consumption
    V_contact
    V_creation
    V_emotion
    V_motion
    V_perception
    V_possession
    V_social
    V_stative
    V_weather


cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
@@ -1,88 +0,0 @@
from __future__ import unicode_literals
cimport parts_of_speech


POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
POS_SENSES[<int>parts_of_speech.ADJ] = 0
POS_SENSES[<int>parts_of_speech.ADV] = 0
POS_SENSES[<int>parts_of_speech.ADP] = 0
POS_SENSES[<int>parts_of_speech.CONJ] = 0
POS_SENSES[<int>parts_of_speech.DET] = 0
POS_SENSES[<int>parts_of_speech.NOUN] = 0
POS_SENSES[<int>parts_of_speech.NUM] = 0
POS_SENSES[<int>parts_of_speech.PRON] = 0
POS_SENSES[<int>parts_of_speech.PRT] = 0
POS_SENSES[<int>parts_of_speech.VERB] = 0
POS_SENSES[<int>parts_of_speech.X] = 0
POS_SENSES[<int>parts_of_speech.PUNCT] = 0
POS_SENSES[<int>parts_of_speech.EOL] = 0


cdef int _sense = 0

for _sense in range(A_behavior, N_act):
    POS_SENSES[<int>parts_of_speech.ADJ] |= 1 << _sense

for _sense in range(N_act, V_body):
    POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense

for _sense in range(V_body, V_weather+1):
    POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense


STRINGS = (
    'A_behavior',
    'A_body',
    'A_feeling',
    'A_mind',
    'A_motion',
    'A_perception',
    'A_quantity',
    'A_relation',
    'A_social',
    'A_spatial',
    'A_substance',
    'A_time',
    'A_weather',
    'N_act',
    'N_animal',
    'N_artifact',
    'N_attribute',
    'N_body',
    'N_cognition',
    'N_communication',
    'N_event',
    'N_feeling',
    'N_food',
    'N_group',
    'N_location',
    'N_motive',
    'N_object',
    'N_person',
    'N_phenomenon',
    'N_plant',
    'N_possession',
    'N_process',
    'N_quantity',
    'N_relation',
    'N_shape',
    'N_state',
    'N_substance',
    'N_time',
    'V_body',
    'V_change',
    'V_cognition',
    'V_communication',
    'V_competition',
    'V_consumption',
    'V_contact',
    'V_creation',
    'V_emotion',
    'V_motion',
    'V_perception',
    'V_possession',
    'V_social',
    'V_stative',
    'V_weather'
)
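The deleted supersense files above pack the set of legal Wordnet supersenses for each part of speech into a single flags_t bitmask, with one bit per enum value. A small plain-Python sketch of the same bit-flag encoding (the enum subset and names here are illustrative, not the full supersense inventory):

    # Each supersense gets a bit position; the legal senses for a POS are OR'd
    # into one integer, and a membership test is a single AND.
    A_BEHAVIOR, A_BODY, N_ACT, N_ANIMAL, V_BODY, V_CHANGE = range(6)

    pos_senses = {'ADJ': 0, 'NOUN': 0, 'VERB': 0}
    for sense in (A_BEHAVIOR, A_BODY):
        pos_senses['ADJ'] |= 1 << sense
    for sense in (N_ACT, N_ANIMAL):
        pos_senses['NOUN'] |= 1 << sense
    for sense in (V_BODY, V_CHANGE):
        pos_senses['VERB'] |= 1 << sense

    def is_valid(pos, sense):
        return bool(pos_senses[pos] & (1 << sense))

    assert is_valid('NOUN', N_ACT)
    assert not is_valid('NOUN', V_BODY)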
@@ -1,12 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals

import spacy.de


#def test_tokenizer():
#    lang = spacy.de.German()
#
#    doc = lang(u'Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.')
#    for token in doc:
#        print(repr(token.string))