new approach to dependency headers

Henning Peters 2015-12-13 11:49:17 +01:00
parent a9fc35d3bf
commit ac318b568c
11 changed files with 252 additions and 580 deletions


@@ -0,0 +1 @@
recursive-include include *.h
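This directive packs every header beneath include/ into the source distribution. The include/ tree itself is not checked in: the rewritten setup.py below syncs it from the installed numpy and murmurhash packages at build time. A quick way to inspect the result — a sketch, assuming a build has already populated include/ and run from the repo root; exact header names vary by numpy/murmurhash version:

# List the headers MANIFEST.in will pick up after setup_package()
# has copied them into include/ (paths below are typical, not guaranteed).
import os

for root, _, files in os.walk('include'):
    for name in files:
        if name.endswith('.h'):
            print(os.path.join(root, name))
# Expected output includes e.g.:
#   include/numpy/arrayobject.h
#   include/murmurhash/MurmurHash2.h
#   include/murmurhash/MurmurHash3.h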

setup.py

@@ -1,231 +1,261 @@
 #!/usr/bin/env python
-from setuptools import setup
-import shutil
-import sys
-import os
-from os import path
-
-from setuptools import Extension
-from distutils import sysconfig
-from distutils.core import setup, Extension
+from __future__ import division, print_function
+import os
+import shutil
+import subprocess
+import sys
 from distutils.command.build_ext import build_ext
+from distutils.sysconfig import get_python_inc
 
-import platform
+try:
+    from setuptools import Extension, setup
+except ImportError:
+    from distutils.core import Extension, setup
 
-PACKAGE_DATA = {
-    "spacy": ["*.pxd"],
-    "spacy.tokens": ["*.pxd"],
-    "spacy.serialize": ["*.pxd"],
-    "spacy.syntax": ["*.pxd"],
-    "spacy.en": [
-        "*.pxd",
-        "data/wordnet/*.exc",
-        "data/wordnet/index.*",
-        "data/tokenizer/*",
-        "data/vocab/serializer.json"
-    ]
-}
+MAJOR = 0
+MINOR = 100
+MICRO = 0
+ISRELEASED = False
+VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
+
+PACKAGES = [
+    'spacy',
+    'spacy.tokens',
+    'spacy.en',
+    'spacy.serialize',
+    'spacy.syntax',
+    'spacy.munge',
+    'spacy.tests',
+    'spacy.tests.matcher',
+    'spacy.tests.morphology',
+    'spacy.tests.munge',
+    'spacy.tests.parser',
+    'spacy.tests.serialize',
+    'spacy.tests.spans',
+    'spacy.tests.tagger',
+    'spacy.tests.tokenizer',
+    'spacy.tests.tokens',
+    'spacy.tests.vectors',
+    'spacy.tests.vocab']
+
+MOD_NAMES = [
+    'spacy.parts_of_speech',
+    'spacy.strings',
+    'spacy.lexeme',
+    'spacy.vocab',
+    'spacy.attrs',
+    'spacy.morphology',
+    'spacy.tagger',
+    'spacy.syntax.stateclass',
+    'spacy.tokenizer',
+    'spacy.syntax.parser',
+    'spacy.syntax.transition_system',
+    'spacy.syntax.arc_eager',
+    'spacy.syntax._parse_features',
+    'spacy.gold',
+    'spacy.orth',
+    'spacy.tokens.doc',
+    'spacy.tokens.span',
+    'spacy.tokens.token',
+    'spacy.serialize.packer',
+    'spacy.serialize.huffman',
+    'spacy.serialize.bits',
+    'spacy.cfile',
+    'spacy.matcher',
+    'spacy.syntax.ner',
+    'spacy.symbols']
+
+if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[0:2] < (3, 4):
+    raise RuntimeError('Python version 2.7 or >= 3.4 required.')
 
 # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
 # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
-compile_options = {'msvc' : ['/Ox', '/EHsc'] ,
-                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] }
-link_options = {'msvc' : [] ,
-                'other' : [] }
+compile_options = {'msvc' : ['/Ox', '/EHsc'],
+                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
+link_options = {'msvc' : [],
+                'other' : []}
+
+if sys.platform.startswith('darwin'):
+    compile_options['other'].append('-mmacosx-version-min=10.8')
+    compile_options['other'].append('-stdlib=libc++')
+    link_options['other'].append('-lc++')
 
 class build_ext_options:
     def build_options(self):
-        c_type = None
-        if self.compiler.compiler_type in compile_options:
-            c_type = self.compiler.compiler_type
-        elif 'other' in compile_options:
-            c_type = 'other'
-        if c_type is not None:
-            for e in self.extensions:
-                e.extra_compile_args = compile_options[c_type]
-        l_type = None
-        if self.compiler.compiler_type in link_options:
-            l_type = self.compiler.compiler_type
-        elif 'other' in link_options:
-            l_type = 'other'
-        if l_type is not None:
-            for e in self.extensions:
-                e.extra_link_args = link_options[l_type]
+        for e in self.extensions:
+            e.extra_compile_args = compile_options.get(
+                self.compiler.compiler_type, compile_options['other'])
+        for e in self.extensions:
+            e.extra_link_args = link_options.get(
+                self.compiler.compiler_type, link_options['other'])
 
-class build_ext_subclass( build_ext, build_ext_options ):
+class build_ext_subclass(build_ext, build_ext_options):
     def build_extensions(self):
         build_ext_options.build_options(self)
         build_ext.build_extensions(self)
 
-# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
-# this is necessary to get it compile.
-# We have to resort to monkey-patching to set the compiler, because pypy broke
-# all the everything.
-pre_patch_customize_compiler = sysconfig.customize_compiler
-def my_customize_compiler(compiler):
-    pre_patch_customize_compiler(compiler)
-    compiler.compiler_cxx = ['c++']
-
-if platform.python_implementation() == 'PyPy':
-    sysconfig.customize_compiler = my_customize_compiler
-
-#def install_headers():
-#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
-#    if not path.exists(dest_dir):
-#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
-#
-#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
-
-includes = ['.', path.join(sys.prefix, 'include')]
-
-try:
-    import numpy
-    numpy_headers = path.join(numpy.get_include(), 'numpy')
-    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
-except ImportError:
-    pass
-except OSError:
-    pass
+# Return the git revision as a string
+def git_version():
+    def _minimal_ext_cmd(cmd):
+        # construct minimal environment
+        env = {}
+        for k in ['SYSTEMROOT', 'PATH']:
+            v = os.environ.get(k)
+            if v is not None:
+                env[k] = v
+        # LANGUAGE is used on win32
+        env['LANGUAGE'] = 'C'
+        env['LANG'] = 'C'
+        env['LC_ALL'] = 'C'
+        out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
+        return out
+
+    try:
+        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
+        GIT_REVISION = out.strip().decode('ascii')
+    except OSError:
+        GIT_REVISION = 'Unknown'
+
+    return GIT_REVISION
+
+def get_version_info():
+    # Adding the git rev number needs to be done inside write_version_py(),
+    # otherwise the import of spacy.about messes up the build under Python 3.
+    FULLVERSION = VERSION
+    if os.path.exists('.git'):
+        GIT_REVISION = git_version()
+    elif os.path.exists('spacy/about.py'):
+        # must be a source distribution, use existing version file
+        try:
+            from spacy.about import git_revision as GIT_REVISION
+        except ImportError:
+            raise ImportError('Unable to import git_revision. Try removing '
+                              'spacy/about.py and the build directory '
+                              'before building.')
+    else:
+        GIT_REVISION = 'Unknown'
+
+    if not ISRELEASED:
+        FULLVERSION += '.dev0+' + GIT_REVISION[:7]
+
+    return FULLVERSION, GIT_REVISION
 
-def clean(mod_names):
-    for name in mod_names:
-        name = name.replace('.', '/')
-        so = name + '.so'
-        html = name + '.html'
-        cpp = name + '.cpp'
-        c = name + '.c'
-        for file_path in [so, html, cpp, c]:
-            if os.path.exists(file_path):
-                os.unlink(file_path)
+def write_version_py(filename='spacy/about.py'):
+    cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
+short_version = '%(version)s'
+version = '%(version)s'
+full_version = '%(full_version)s'
+git_revision = '%(git_revision)s'
+release = %(isrelease)s
+if not release:
+    version = full_version
+"""
+    FULLVERSION, GIT_REVISION = get_version_info()
+
+    with open(filename, 'w') as f:
+        f.write(cnt % {'version': VERSION,
+                       'full_version' : FULLVERSION,
+                       'git_revision' : GIT_REVISION,
+                       'isrelease': str(ISRELEASED)})
 
-def name_to_path(mod_name, ext):
-    return '%s.%s' % (mod_name.replace('.', '/'), ext)
+def generate_cython():
+    cwd = os.path.abspath(os.path.dirname(__file__))
+    print('Cythonizing sources')
+    p = subprocess.call([sys.executable,
+                         os.path.join(cwd, 'bin', 'cythonize.py'),
+                         'spacy'],
+                        cwd=cwd)
+    if p != 0:
+        raise RuntimeError('Running cythonize failed')
 
-def c_ext(mod_name, language, includes):
-    mod_path = name_to_path(mod_name, language)
-    return Extension(mod_name, [mod_path], include_dirs=includes)
+def clean():
+    for name in MOD_NAMES:
+        name = name.replace('.', '/')
+        for ext in ['.so', '.html', '.cpp', '.c']:
+            if os.path.exists(name + ext):
+                os.unlink(name + ext)
 
-def cython_setup(mod_names, language, includes):
-    import Cython.Distutils
-    import Cython.Build
-    import distutils.core
-
-    class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ):
-        def build_extensions(self):
-            build_ext_options.build_options(self)
-            Cython.Distutils.build_ext.build_extensions(self)
-
-    if language == 'cpp':
-        language = 'c++'
-    exts = []
-    for mod_name in mod_names:
-        mod_path = mod_name.replace('.', '/') + '.pyx'
-        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
-        exts.append(e)
-    distutils.core.setup(
-        name='spacy',
-        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge'],
-        description="Industrial-strength NLP",
-        author='Matthew Honnibal',
-        author_email='honnibal@gmail.com',
-        version=VERSION,
-        url="http://spacy.io",
-        package_data=PACKAGE_DATA,
-        ext_modules=exts,
-        cmdclass={'build_ext': build_ext_cython_subclass},
-        license="MIT",
-    )
-
-def run_setup(exts):
-    setup(
-        name='spacy',
-        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge',
-                  'spacy.tests',
-                  'spacy.tests.matcher',
-                  'spacy.tests.morphology',
-                  'spacy.tests.munge',
-                  'spacy.tests.parser',
-                  'spacy.tests.serialize',
-                  'spacy.tests.spans',
-                  'spacy.tests.tagger',
-                  'spacy.tests.tokenizer',
-                  'spacy.tests.tokens',
-                  'spacy.tests.vectors',
-                  'spacy.tests.vocab'],
-        description="Industrial-strength NLP",
-        author='Matthew Honnibal',
-        author_email='honnibal@gmail.com',
-        version=VERSION,
-        url="http://honnibal.github.io/spaCy/",
-        package_data=PACKAGE_DATA,
-        ext_modules=exts,
-        license="MIT",
-        install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
-                          'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
-                          'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
-        setup_requires=["headers_workaround"],
-        cmdclass = {'build_ext': build_ext_subclass },
-    )
-
-    import headers_workaround
-    headers_workaround.fix_venv_pypy_include()
-    headers_workaround.install_headers('murmurhash')
-    headers_workaround.install_headers('numpy')
-
-VERSION = '0.100'
-def main(modules, is_pypy):
-    language = "cpp"
-    includes = ['.', path.join(sys.prefix, 'include')]
-    if sys.platform.startswith('darwin'):
-        compile_options['other'].append('-mmacosx-version-min=10.8')
-        compile_options['other'].append('-stdlib=libc++')
-        link_options['other'].append('-lc++')
-    if use_cython:
-        cython_setup(modules, language, includes)
-    else:
-        exts = [c_ext(mn, language, includes)
-                for mn in modules]
-        run_setup(exts)
-
-MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
-             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
-             'spacy.morphology', 'spacy.tagger',
-             'spacy.syntax.stateclass',
-             'spacy.tokenizer',
-             'spacy.syntax.parser',
-             'spacy.syntax.transition_system',
-             'spacy.syntax.arc_eager',
-             'spacy.syntax._parse_features',
-             'spacy.gold', 'spacy.orth',
-             'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token',
-             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
-             'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner',
-             'spacy.symbols']
+def setup_package():
+    src_path = os.path.dirname(os.path.abspath(sys.argv[0]))
+    old_path = os.getcwd()
+    os.chdir(src_path)
+    sys.path.insert(0, src_path)
+
+    # Rewrite the version file everytime
+    write_version_py()
+
+    include_dirs = [
+        get_python_inc(plat_specific=True),
+        os.path.join(src_path, 'include')]
+
+    ext_modules = []
+    for mod_name in MOD_NAMES:
+        mod_path = mod_name.replace('.', '/') + '.cpp'
+        ext_modules.append(
+            Extension(mod_name, [mod_path],
+                language='c++', include_dirs=include_dirs))
+
+    metadata = dict(
+        name='spacy',
+        packages=PACKAGES,
+        description='Industrial-strength NLP',
+        author='Matthew Honnibal',
+        author_email='matt@spacy.io',
+        version=VERSION,
+        url='https://spacy.io',
+        license='MIT',
+        ext_modules=ext_modules,
+        install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
+                          'thinc == 4.0.0', 'text_unidecode', 'plac', 'six',
+                          'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
+        cmdclass = {
+            'build_ext': build_ext_subclass},
+    )
+
+    # Run build
+    cwd = os.path.abspath(os.path.dirname(__file__))
+    if not os.path.exists(os.path.join(cwd, 'PKG-INFO')):
+        # Generate Cython sources, unless building from source release
+        generate_cython()
+
+    # sync include dirs from native dependencies
+    include_dir = os.path.join(src_path, 'include')
+    if os.path.exists(include_dir):
+        shutil.rmtree(include_dir)
+    os.mkdir(include_dir)
+
+    import numpy
+    shutil.copytree(
+        os.path.join(numpy.get_include(), 'numpy'),
+        os.path.join(include_dir, 'numpy'))
+
+    import murmurhash
+    shutil.copytree(
+        os.path.join(os.path.dirname(murmurhash.__file__), 'headers', 'murmurhash'),
+        os.path.join(include_dir, 'murmurhash'))
+
+    try:
+        setup(**metadata)
+    finally:
+        del sys.path[0]
+        os.chdir(old_path)
 
 if __name__ == '__main__':
     if sys.argv[1] == 'clean':
-        clean(MOD_NAMES)
+        clean()
     else:
-        use_cython = sys.argv[1] == 'build_ext'
-        main(MOD_NAMES, use_cython)
+        setup_package()
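For orientation, here is roughly what write_version_py() renders the cnt template into on an unreleased git checkout. The values below are illustrative only (the hash is truncated and not taken from a real build):

# THIS FILE IS GENERATED FROM SPACY SETUP.PY
short_version = '0.100.0'
version = '0.100.0'
full_version = '0.100.0.dev0+ac318b5'   # VERSION + '.dev0+' + first 7 hash chars
git_revision = 'ac318b568c'             # output of `git rev-parse HEAD` (truncated here)
release = False
if not release:
    version = full_version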


@@ -1,3 +0,0 @@
"""Feed-forward neural network, using Theano."""


@@ -1,146 +0,0 @@
"""Feed-forward neural network, using Theano."""
import os
import shutil
import sys
import time
from os import path

import numpy
import theano
import theano.tensor as T
import plac

from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir


def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed):
    # allocate symbolic variables for the data
    words = T.vector('words')
    tags = T.vector('tags')

    word_e = _init_embedding(n_words, n_word_embed)
    tag_e = _init_embedding(n_tags, n_tag_embed)
    label_e = _init_embedding(n_labels, n_label_embed)
    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
    hidden_W, hidden_b = _init_hidden_weights(28*28, n_hidden, T.tanh)
    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e, label_e]

    x = T.concatenate([
        T.flatten(word_e[word_indices], outdim=1),
        T.flatten(tag_e[tag_indices], outdim=1)])

    p_y_given_x = feed_layer(
        T.nnet.softmax,
        maxent_W,
        maxent_b,
        feed_layer(
            T.tanh,
            hidden_W,
            hidden_b,
            x))[0]

    guess = T.argmax(p_y_given_x)

    cost = (
        -T.log(p_y_given_x[y])
        + L1(L1_reg, maxent_W, hidden_W, word_e, tag_e)
        + L2(L2_reg, maxent_W, hidden_W, word_e, tag_e)
    )

    train_model = theano.function(
        inputs=[words, tags, y],
        outputs=guess,
        updates=[update(learning_rate, param, cost) for param in params]
    )

    evaluate_model = theano.function(
        inputs=[x, y],
        outputs=T.neq(y, T.argmax(p_y_given_x[0])),
    )
    return train_model, evaluate_model


def _init_embedding(vocab_size, n_dim):
    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
    return theano.shared(embedding).astype(theano.config.floatX)


def _init_maxent_weights(n_hidden, n_out):
    weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
    bias = numpy.zeros((10,), dtype=theano.config.floatX)
    return (
        theano.shared(name='W', borrow=True, value=weights),
        theano.shared(name='b', borrow=True, value=bias)
    )


def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    rng = numpy.random.RandomState(1234)
    weights = numpy.asarray(
        rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)
        ),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return (
        theano.shared(value=weights, name='W', borrow=True),
        theano.shared(value=bias, name='b', borrow=True)
    )


def feed_layer(activation, weights, bias, input):
    return activation(T.dot(input, weights) + bias)


def L1(L1_reg, *weights):
    # the call sites pass four weight matrices, so accept any number
    return L1_reg * sum(abs(w).sum() for w in weights)


def L2(L2_reg, *weights):
    return L2_reg * sum((w ** 2).sum() for w in weights)


def update(eta, param, cost):
    return (param, param - (eta * T.grad(cost, param)))


def main(train_loc, eval_loc, model_dir):
    learning_rate = 0.01
    L1_reg = 0.00
    L2_reg = 0.0001

    print "... reading the data"
    gold_train = list(read_json_file(train_loc))

    print '... building the model'
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
                                              L1_reg, L2_reg)

    print '... training'
    loss = 0
    for epoch in range(1, n_epochs+1):
        for raw_text, sents in gold_train:
            for (ids, words, tags, ner, heads, deps), _ in sents:
                tokens = nlp.tokenizer.tokens_from_list(words)
                for t in tokens:
                    guess = train_model([t.orth], [t.tag])
                    loss += guess != t.tag
        print loss
        # compute zero-one loss on validation set
        #error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
        #print('epoch %i, validation error %f %%' % (epoch, error * 100))


if __name__ == '__main__':
    plac.call(main)


@@ -1,13 +0,0 @@
from ._ml cimport Model
from thinc.nn cimport InputLayer


cdef class TheanoModel(Model):
    cdef InputLayer input_layer
    cdef object train_func
    cdef object predict_func
    cdef object debug

    cdef public float eta
    cdef public float mu
    cdef public float t


@@ -1,52 +0,0 @@
from thinc.api cimport Example, ExampleC
from thinc.typedefs cimport weight_t

from ._ml cimport arg_max_if_true
from ._ml cimport arg_max_if_zero

import numpy
from os import path


cdef class TheanoModel(Model):
    def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
                 eta=0.001, mu=0.9, debug=None):
        if model_loc is not None and path.isdir(model_loc):
            model_loc = path.join(model_loc, 'model')

        self.eta = eta
        self.mu = mu
        self.t = 1
        initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0)
        self.input_layer = InputLayer(input_spec, initializer)
        self.train_func = train_func
        self.predict_func = predict_func
        self.debug = debug

        self.n_classes = n_classes
        self.n_feats = len(self.input_layer)
        self.model_loc = model_loc

    def predict(self, Example eg):
        self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True)
        theano_scores = self.predict_func(eg.embeddings)[0]
        cdef int i
        for i in range(self.n_classes):
            eg.c.scores[i] = theano_scores[i]
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)

    def train(self, Example eg):
        self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False)
        theano_scores, update, y, loss = self.train_func(eg.embeddings, eg.costs,
                                                         self.eta, self.mu)
        self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
        for i in range(self.n_classes):
            eg.c.scores[i] = theano_scores[i]
        eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
        eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
        eg.c.cost = eg.c.costs[eg.c.guess]
        eg.c.loss = loss
        self.t += 1

    def end_training(self):
        pass


@@ -30,8 +30,12 @@ def main(data_size='all', force=False):
     path = os.path.dirname(os.path.abspath(__file__))
+    data_path = os.path.abspath(os.path.join(path, '..', 'data'))
+    if not os.path.isdir(data_path):
+        os.mkdir(data_path)
+
     command = sputnik.make_command(
-        data_path=os.path.abspath(os.path.join(path, '..', 'data')),
+        data_path=data_path,
         repository_url='https://index.spacy.io')
 
     if force:
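One caveat with the isdir/mkdir pair above: two concurrent downloads can race between the check and the create. A tolerant variant, shown only as a sketch (the commit itself inlines the simple guard):

import os

def ensure_dir(path):
    # Create `path` if missing, tolerating a concurrent creator;
    # re-raise if mkdir failed for any other reason.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise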


@@ -1,62 +0,0 @@
# Enum of Wordnet supersenses
cimport parts_of_speech

from .typedefs cimport flags_t


cpdef enum:
    A_behavior
    A_body
    A_feeling
    A_mind
    A_motion
    A_perception
    A_quantity
    A_relation
    A_social
    A_spatial
    A_substance
    A_time
    A_weather
    N_act
    N_animal
    N_artifact
    N_attribute
    N_body
    N_cognition
    N_communication
    N_event
    N_feeling
    N_food
    N_group
    N_location
    N_motive
    N_object
    N_person
    N_phenomenon
    N_plant
    N_possession
    N_process
    N_quantity
    N_relation
    N_shape
    N_state
    N_substance
    N_time
    V_body
    V_change
    V_cognition
    V_communication
    V_competition
    V_consumption
    V_contact
    V_creation
    V_emotion
    V_motion
    V_perception
    V_possession
    V_social
    V_stative
    V_weather


cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES


@@ -1,88 +0,0 @@
from __future__ import unicode_literals

cimport parts_of_speech


POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
POS_SENSES[<int>parts_of_speech.ADJ] = 0
POS_SENSES[<int>parts_of_speech.ADV] = 0
POS_SENSES[<int>parts_of_speech.ADP] = 0
POS_SENSES[<int>parts_of_speech.CONJ] = 0
POS_SENSES[<int>parts_of_speech.DET] = 0
POS_SENSES[<int>parts_of_speech.NOUN] = 0
POS_SENSES[<int>parts_of_speech.NUM] = 0
POS_SENSES[<int>parts_of_speech.PRON] = 0
POS_SENSES[<int>parts_of_speech.PRT] = 0
POS_SENSES[<int>parts_of_speech.VERB] = 0
POS_SENSES[<int>parts_of_speech.X] = 0
POS_SENSES[<int>parts_of_speech.PUNCT] = 0
POS_SENSES[<int>parts_of_speech.EOL] = 0


cdef int _sense = 0

for _sense in range(A_behavior, N_act):
    POS_SENSES[<int>parts_of_speech.ADJ] |= 1 << _sense

for _sense in range(N_act, V_body):
    POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense

for _sense in range(V_body, V_weather+1):
    POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense


STRINGS = (
    'A_behavior',
    'A_body',
    'A_feeling',
    'A_mind',
    'A_motion',
    'A_perception',
    'A_quantity',
    'A_relation',
    'A_social',
    'A_spatial',
    'A_substance',
    'A_time',
    'A_weather',
    'N_act',
    'N_animal',
    'N_artifact',
    'N_attribute',
    'N_body',
    'N_cognition',
    'N_communication',
    'N_event',
    'N_feeling',
    'N_food',
    'N_group',
    'N_location',
    'N_motive',
    'N_object',
    'N_person',
    'N_phenomenon',
    'N_plant',
    'N_possession',
    'N_process',
    'N_quantity',
    'N_relation',
    'N_shape',
    'N_state',
    'N_substance',
    'N_time',
    'V_body',
    'V_change',
    'V_cognition',
    'V_communication',
    'V_competition',
    'V_consumption',
    'V_contact',
    'V_creation',
    'V_emotion',
    'V_motion',
    'V_perception',
    'V_possession',
    'V_social',
    'V_stative',
    'V_weather'
)


@@ -1,12 +0,0 @@
# encoding: utf8
from __future__ import unicode_literals

import spacy.de


#def test_tokenizer():
#    lang = spacy.de.German()
#
#    doc = lang(u'Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.')
#    for token in doc:
#        print(repr(token.string))

tox.ini

@@ -0,0 +1,13 @@
[tox]
envlist =
    py27
    py34
recreate = True

[testenv]
changedir = {envtmpdir}
deps =
    pytest
commands =
    python -m spacy.en.download
    python -m pytest {toxinidir}/spacy/ --models --vectors --slow