diff --git a/setup.py b/setup.py
index fb6a5b718..d37396bc9 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
         package_data={"spacy": ["*.pxd"],
                       "spacy.en": ["*.pxd", "data/pos/*",
                                    "data/wordnet/*", "data/tokenizer/*",
+                                   "data/vocab/tag_map.json",
                                    "data/vocab/lexemes.bin",
                                    "data/vocab/strings.txt"],
                       "spacy.syntax": ["*.pxd"]},
@@ -134,7 +135,7 @@ def run_setup(exts):
     headers_workaround.install_headers('numpy')
 
 
-VERSION = '0.94'
+VERSION = '0.95'
 def main(modules, is_pypy):
     language = "cpp"
     includes = ['.', path.join(sys.prefix, 'include')]
diff --git a/spacy/en/download.py b/spacy/en/download.py
index 01c87a4e4..91f31565b 100644
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -7,7 +7,7 @@ import wget
 import plac
 
 # TODO: Read this from the same source as the setup
-VERSION = '0.9.1'
+VERSION = '0.9.5'
 
 AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
 
diff --git a/tests/test_basic_create.py b/tests/test_basic_create.py
new file mode 100644
index 000000000..900a7bc64
--- /dev/null
+++ b/tests/test_basic_create.py
@@ -0,0 +1,88 @@
+"""Some quick tests that don't depend on data files or on pytest, for debugging the
+MS windows build issues."""
+from __future__ import print_function, unicode_literals
+
+import unittest
+import re
+
+from spacy.lemmatizer import Lemmatizer
+from spacy.morphology import Morphology
+from spacy.strings import StringStore
+from spacy.vocab import Vocab
+from spacy.tokenizer import Tokenizer
+from spacy.syntax.arc_eager import ArcEager
+from spacy._ml import Model
+from spacy.tagger import Tagger
+from spacy.syntax.parser import Parser
+from spacy.matcher import Matcher
+
+
+class TestStringStore(unittest.TestCase):
+    def test_encode_decode(self):
+        strings = StringStore()
+        hello_id = strings[u'Hello']
+        world_id = strings[u'World']
+
+        self.assertNotEqual(hello_id, world_id)
+
+        self.assertEqual(strings[hello_id], u'Hello')
+        self.assertEqual(strings[world_id], u'World')
+
+        self.assertEqual(strings[u'Hello'], hello_id)
+        self.assertEqual(strings[u'World'], world_id)
+
+
+class TestMorphology(unittest.TestCase):
+    def test_create(self):
+        strings = StringStore()
+        lemmatizer = Lemmatizer({}, {}, {})
+        morphology = Morphology(strings, {}, lemmatizer)
+
+
+class TestVocab(unittest.TestCase):
+    def test_create(self):
+        vocab = Vocab()
+
+    def test_get_lexeme(self):
+        vocab = Vocab()
+        lexeme = vocab[u'Hello']
+        self.assertEqual(lexeme.orth_, u'Hello')
+
+
+class TestTokenizer(unittest.TestCase):
+    def test_create(self):
+        vocab = Vocab()
+        dummy_re = re.compile(r'sklfb;s')
+        tokenizer = Tokenizer(vocab, {}, dummy_re, dummy_re, dummy_re)
+        doc = tokenizer(u'I am a document.')
+
+        self.assertEqual(len(doc), 4)
+
+
+class TestTagger(unittest.TestCase):
+    def test_create(self):
+        vocab = Vocab()
+        templates = ((1,),)
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        tagger = Tagger(vocab, model)
+
+
+class TestParser(unittest.TestCase):
+    def test_create(self):
+        vocab = Vocab()
+        templates = ((1,),)
+        labels_by_action = {0: ['One', 'Two'], 1: ['Two', 'Three']}
+        transition_system = ArcEager(vocab.strings, labels_by_action)
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+
+        parser = Parser(vocab.strings, transition_system, model)
+
+
+class TestMatcher(unittest.TestCase):
+    def test_create(self):
+        vocab = Vocab()
+        matcher = Matcher(vocab, {})
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_basic_load.py b/tests/test_basic_load.py
new file mode 100644
index 000000000..eb7adbe97
--- /dev/null
+++ b/tests/test_basic_load.py
@@ -0,0 +1,77 @@
+"""Some quick tests that load the default English data files but don't depend on
+pytest, for debugging the MS windows build issues."""
+from __future__ import print_function, unicode_literals
+
+import unittest
+import re
+from os import path
+
+from spacy.lemmatizer import Lemmatizer
+from spacy.morphology import Morphology
+from spacy.strings import StringStore
+from spacy.vocab import Vocab
+from spacy.tokenizer import Tokenizer
+from spacy.syntax.arc_eager import ArcEager
+from spacy._ml import Model
+from spacy.tagger import Tagger
+from spacy.syntax.parser import Parser
+from spacy.matcher import Matcher
+from spacy.syntax.parser import get_templates
+
+from spacy.en import English
+
+from thinc.learner import LinearModel
+
+
+class TestLoadVocab(unittest.TestCase):
+    def test_load(self):
+        vocab = Vocab.from_dir(path.join(English.default_data_dir(), 'vocab'))
+
+
+class TestLoadTokenizer(unittest.TestCase):
+    def test_load(self):
+        data_dir = English.default_data_dir()
+        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
+        tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))
+
+
+class TestLoadTagger(unittest.TestCase):
+    def test_load(self):
+        data_dir = English.default_data_dir()
+        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
+        tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
+
+
+class TestLoadParser(unittest.TestCase):
+    def test_load(self):
+        data_dir = English.default_data_dir()
+        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
+        parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
+
+    def test_load_careful(self):
+        """Build the parser step by step from an inline config instead of Parser.from_dir."""
+        config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}
+
+        data_dir = English.default_data_dir()
+        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
+
+        moves = ArcEager(vocab.strings, config_data['labels'])
+        templates = get_templates(config_data['features'])
+
+        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))
+
+        parser = Parser(vocab.strings, moves, model)
+
+    def test_thinc_load(self):
+        """Load the deps/model file directly through thinc's LinearModel, bypassing spacy."""
+        data_dir = English.default_data_dir()
+        model_loc = path.join(data_dir, 'deps', 'model')
+
+        # n classes. moves.n_moves above
+        # n features. len(templates) + 1 above
+        model = LinearModel(92, 116)
+        model.load(model_loc)
+
+
+if __name__ == '__main__':
+    unittest.main()