From 4131e455435caeef4324b3c8c937d61b320565a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 16:55:03 +0100 Subject: [PATCH 01/26] * Add MockPackage class, to see whether we can proxy for Sputnik in a lightweight way --- spacy/util.py | 75 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 8c9ea319c..2b6f50a6b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -2,23 +2,74 @@ import os import io import json import re - -from sputnik import Sputnik +import os.path +from contextlib import contextmanager from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def get_package(name=None, data_path=None): - if data_path is None: - if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') - else: - data_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), 'data')) +def local_path(subdir): + return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) - sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version - pool = sputnik.pool(data_path) - return pool.get(name or 'en_default') + +class MockPackage(object): + @classmethod + def create_or_return(cls, me_or_arg): + return me_or_arg if isinstance(me_or_arg, cls) else me_or_arg + + def __init__(self, data_path=None): + if data_path is None: + data_path = local_path('data') + self.name = None + self.data_path = data_path + self._root = self.data_path + + def get(self, key): + pass + + def has_file(self, *path_parts): + return os.path.exists(os.path.join(self._root, *path_parts)) + + def file_path(self, *path_parts, **kwargs): + return os.path.join(self._root, *path_parts) + + def dir_path(self, *path_parts, **kwargs): + return os.path.join(self._root, *path_parts) + + def load_utf8(self, func, *path_parts, **kwargs): + if kwargs.get('require', True): + with io.open(self.file_path(os.path.join(*path_parts)), + mode='r', encoding='utf8') as f: + return func(f) + else: + return None + + @contextmanager + def open(self, path_parts, default=IOError): + if isinstance(default, Exception): + raise default + + # Enter + file_ = io.open(self.file_path(os.path.join(*path_parts)), + mode='r', encoding='utf8') + yield file_ + # Exit + file_.close() + + + +def get_package(name=None, data_path=None): + return MockPackage(data_path) + #if data_path is None: + # if os.environ.get('SPACY_DATA'): + # data_path = os.environ.get('SPACY_DATA') + # else: + # data_path = os.path.abspath( + # os.path.join(os.path.dirname(__file__), 'data')) + + #sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version + #pool = sputnik.pool(data_path) + #return pool.get(name or 'en_default') def normalize_slice(length, start, stop, step=None): From c5902f2b4bb4f5b0154cf12064b3180e02f5f960 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 16:56:02 +0100 Subject: [PATCH 02/26] * Upd Lemmatizer to use MockPackage. Replace from_package with load() classmethod --- spacy/lemmatizer.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index c5b9c1c50..7cd37a331 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -8,25 +8,22 @@ except ImportError: import json from .parts_of_speech import NOUN, VERB, ADJ, PUNCT +from .util import MockPackage class Lemmatizer(object): @classmethod - def from_package(cls, package): + def load(cls, pkg_or_str_or_file): + pkg = MockPackage.create_or_return(pkg_or_str_or_file) index = {} exc = {} for pos in ['adj', 'noun', 'verb']: - index[pos] = package.load_utf8(read_index, - 'wordnet', 'index.%s' % pos, - default=set()) # TODO: really optional? - exc[pos] = package.load_utf8(read_exc, - 'wordnet', '%s.exc' % pos, - default={}) # TODO: really optional? - - rules = package.load_utf8(json.load, - 'vocab', 'lemma_rules.json', - default={}) # TODO: really optional? - + with pkg.open(('wordnet', 'index.%s' % pos), default=None) as file_: + index[pos] = read_index(file_) if file_ is not None else set() + with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_: + exc[pos] = read_exc(file_) if file_ is not None else {} + with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_: + rules = json.load(file_) if file_ is not None else {} return cls(index, exc, rules) def __init__(self, index, exceptions, rules): From 0e2498da00349e16fe4e74c303172a24da698590 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 16:56:51 +0100 Subject: [PATCH 03/26] * Replace from_package with load() classmethod in Vocab --- spacy/vocab.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index bb0ae6173..ab0a522b1 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,6 +19,7 @@ from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer +from .util import MockPackage from . import attrs from . import symbols @@ -47,11 +48,12 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' @classmethod - def from_package(cls, package, get_lex_attr=None): + def load(cls, pkg_or_str_or_file, get_lex_attr=None): + package = MockPackage.create_or_return(pkg_or_str_or_file) tag_map = package.load_utf8(json.load, 'vocab', 'tag_map.json') - lemmatizer = Lemmatizer.from_package(package) + lemmatizer = Lemmatizer.load(package) serializer_freqs = package.load_utf8(json.load, 'vocab', 'serializer.json', @@ -67,7 +69,6 @@ cdef class Vocab: if package.has_file('vocab', 'vec.bin'): # TODO: really optional? self.vectors_length = self.load_vectors_from_bin_loc( package.file_path('vocab', 'vec.bin')) - return self def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None): From aec130af560b1da67a50beaa43366e09a6a3b9d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 18:00:48 +0100 Subject: [PATCH 04/26] Use util.Package class for io Previous Sputnik integration caused API change: Vocab, Tagger, etc were loaded via a from_package classmethod, that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance, in order to acquire a Package via sp.pool(). Instead I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod, that accepts either util.Package objects, or strings. We can later gut the internals of this and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download --- spacy/language.py | 35 ++++------------------ spacy/lemmatizer.py | 4 +-- spacy/matcher.pyx | 4 ++- spacy/syntax/parser.pyx | 7 +++++ spacy/tagger.pyx | 10 +++++-- spacy/tests/conftest.py | 8 ++++- spacy/tests/serialize/test_packer.py | 7 ++++- spacy/tests/tagger/test_lemmatizer.py | 11 +++++-- spacy/tests/tokenizer/test_contractions.py | 7 +++++ spacy/tests/website/conftest.py | 6 +++- spacy/tests/website/test_home.py | 8 ++++- spacy/tokenizer.pyx | 8 +++-- spacy/util.py | 28 ++++++++++------- spacy/vocab.pyx | 21 +++++++------ 14 files changed, 97 insertions(+), 67 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index c992335b3..d16032c47 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD -from .util import get_package +from .util import get_package, MockPackage class Language(object): @@ -142,7 +142,7 @@ class Language(object): package = get_package() if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs() - return Vocab.from_package(package, get_lex_attr=get_lex_attr) + return Vocab.load(package, get_lex_attr=get_lex_attr) @classmethod def default_parser(cls, package, vocab): @@ -182,40 +182,17 @@ class Language(object): - Language(model='en_default ==1.0.0') - Language(model='en_default <1.1.0, data_dir='spacy/data') """ - # support non-package data dirs - if data_dir and path.exists(path.join(data_dir, 'vocab')): - class Package(object): - def __init__(self, root): - self.root = root - - def has_file(self, *path_parts): - return path.exists(path.join(self.root, *path_parts)) - - def file_path(self, *path_parts, **kwargs): - return path.join(self.root, *path_parts) - - def dir_path(self, *path_parts, **kwargs): - return path.join(self.root, *path_parts) - - def load_utf8(self, func, *path_parts, **kwargs): - with io.open(self.file_path(path.join(*path_parts)), - mode='r', encoding='utf8') as f: - return func(f) - - warn("using non-package data_dir", DeprecationWarning) - package = Package(data_dir) - else: - package = get_package(name=model, data_path=data_dir) + package = MockPackage(data_dir) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): vocab = self.default_vocab(package) self.vocab = vocab if tokenizer in (None, True): - tokenizer = Tokenizer.from_package(package, self.vocab) + tokenizer = Tokenizer.load(package, self.vocab) self.tokenizer = tokenizer if tagger in (None, True): - tagger = Tagger.from_package(package, self.vocab) + tagger = Tagger.load(package, self.vocab) self.tagger = tagger if entity in (None, True): entity = self.default_entity(package, self.vocab) @@ -224,7 +201,7 @@ class Language(object): parser = self.default_parser(package, self.vocab) self.parser = parser if matcher in (None, True): - matcher = Matcher.from_package(package, self.vocab) + matcher = Matcher.load(package, self.vocab) self.matcher = matcher def __reduce__(self): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 7cd37a331..2362a7842 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -8,13 +8,13 @@ except ImportError: import json from .parts_of_speech import NOUN, VERB, ADJ, PUNCT -from .util import MockPackage +from .util import MockPackage as Package class Lemmatizer(object): @classmethod def load(cls, pkg_or_str_or_file): - pkg = MockPackage.create_or_return(pkg_or_str_or_file) + pkg = Package.create_or_return(pkg_or_str_or_file) index = {} exc = {} for pos in ['adj', 'noun', 'verb']: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 4d36b7742..6c70a6f68 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -21,6 +21,7 @@ from .tokens.doc cimport Doc from .vocab cimport Vocab from .attrs import FLAG61 as U_ENT +from .util import MockPackage from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT @@ -169,7 +170,8 @@ cdef class Matcher: cdef object _patterns @classmethod - def from_package(cls, package, Vocab vocab): + def load(cls, pkg_or_str_or_file, Vocab vocab): + package = MockPackage.create_or_return(pkg_or_str_or_file) patterns = package.load_utf8(json.load, 'vocab', 'gazetteer.json', default={}) # TODO: really optional? diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index dd4fb3bea..c29d59758 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -88,6 +88,13 @@ cdef class Parser: model.load(path.join(model_dir, 'model')) return cls(strings, moves, model) + @classmethod + def load(cls, pkg_or_str_or_file, vocab): + # TODO + raise NotImplementedError( + "This should be here, but isn't yet =/. Use Parser.from_dir") + + def __reduce__(self): return (Parser, (self.moves.strings, self.moves, self.model), None, None) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 2c05b4a84..decf918d8 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .attrs cimport * +from .util import Package + cpdef enum: P2_orth @@ -146,7 +148,8 @@ cdef class Tagger: return cls(vocab, model) @classmethod - def from_package(cls, package, vocab): + def load(cls, pkg_or_str_or_file, vocab): + pkg = Package.create_or_return(pkg_or_str_or_file) # TODO: templates.json deprecated? not present in latest package templates = cls.default_templates() # templates = package.load_utf8(json.load, @@ -156,8 +159,9 @@ cdef class Tagger: model = TaggerModel(vocab.morphology.n_tags, ConjunctionExtracter(N_CONTEXT_FIELDS, templates)) - if package.has_file('pos', 'model'): # TODO: really optional? - model.load(package.file_path('pos', 'model')) + + if pkg.has_file('pos', 'model'): # TODO: really optional? + model.load(pkg.file_path('pos', 'model')) return cls(vocab, model) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 03e728a12..b8a620d88 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,11 +1,17 @@ from spacy.en import English import pytest +import os @pytest.fixture(scope="session") def EN(): - return English() + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = None + print("Load EN from %s" % data_path) + return English(data_dir=data_path) def pytest_addoption(parser): diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index 0e13b2de5..1d3b12117 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -11,6 +11,7 @@ from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer from os import path +import os from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD from spacy.serialize.packer import Packer @@ -20,7 +21,11 @@ from spacy.serialize.bits import BitArray @pytest.fixture def vocab(): - vocab = English.default_vocab() + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = None + vocab = English.default_vocab(package=data_path) lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' lex = vocab['the'] diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index e25fbe199..ebcc4e881 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -1,22 +1,27 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import os import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.util import get_package +from spacy.util import get_package, MockPackage import pytest @pytest.fixture def package(): - return get_package() + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = None + return get_package(data_path=data_path) @pytest.fixture def lemmatizer(package): - return Lemmatizer.from_package(package) + return Lemmatizer.load(package) def test_read_index(package): diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index ea93ff8b4..76597ec5a 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -48,3 +48,10 @@ def test_punct(en_tokenizer): assert len(tokens) == 2 tokens = en_tokenizer("``We've") assert len(tokens) == 3 + + +def test_therell(en_tokenizer): + tokens = en_tokenizer("there'll") + assert len(tokens) == 2 + assert tokens[0].text == "there" + assert tokens[1].text == "'ll" diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py index b4934d20b..d7b4b3252 100644 --- a/spacy/tests/website/conftest.py +++ b/spacy/tests/website/conftest.py @@ -6,7 +6,11 @@ import os @pytest.fixture(scope='session') def nlp(): from spacy.en import English - return English() + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = None + return English(data_dir=data_path) @pytest.fixture() diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py index 5317e7e56..ef13b4677 100644 --- a/spacy/tests/website/test_home.py +++ b/spacy/tests/website/test_home.py @@ -10,8 +10,14 @@ def token(doc): def test_load_resources_and_process_text(): + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = None + print("Load EN from %s" % data_path) + from spacy.en import English - nlp = English() + nlp = English(data_dir=data_path) doc = nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 345734682..b90945678 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -15,8 +15,9 @@ from .strings cimport hash_string cimport cython from . import util -from .util import read_lang_data from .tokens.doc cimport Doc +from .util import read_lang_data +from .util import MockPackage as Package cdef class Tokenizer: @@ -41,8 +42,9 @@ cdef class Tokenizer: return (self.__class__, args, None, None) @classmethod - def from_package(cls, package, Vocab vocab): - rules, prefix_re, suffix_re, infix_re = read_lang_data(package) + def load(cls, pkg_or_str_or_file, Vocab vocab): + pkg = Package.create_or_return(pkg_or_str_or_file) + rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) prefix_re = re.compile(prefix_re) suffix_re = re.compile(suffix_re) infix_re = re.compile(infix_re) diff --git a/spacy/util.py b/spacy/util.py index 2b6f50a6b..74558c59c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -4,6 +4,7 @@ import json import re import os.path from contextlib import contextmanager +import types from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE @@ -12,10 +13,10 @@ def local_path(subdir): return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) -class MockPackage(object): +class Package(object): @classmethod def create_or_return(cls, me_or_arg): - return me_or_arg if isinstance(me_or_arg, cls) else me_or_arg + return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg) def __init__(self, data_path=None): if data_path is None: @@ -46,15 +47,20 @@ class MockPackage(object): @contextmanager def open(self, path_parts, default=IOError): - if isinstance(default, Exception): - raise default - - # Enter - file_ = io.open(self.file_path(os.path.join(*path_parts)), - mode='r', encoding='utf8') - yield file_ - # Exit - file_.close() + if not self.has_file(*path_parts): + if isinstance(default, types.TypeType) and issubclass(default, Exception): + raise default(self.file_path(*path_parts)) + elif isinstance(default, Exception): + raise default + else: + yield default + else: + # Enter + file_ = io.open(self.file_path(os.path.join(*path_parts)), + mode='r', encoding='utf8') + yield file_ + # Exit + file_.close() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ab0a522b1..1444f767e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,7 +19,7 @@ from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer -from .util import MockPackage +from .util import Package from . import attrs from . import symbols @@ -49,24 +49,23 @@ cdef class Vocab: ''' @classmethod def load(cls, pkg_or_str_or_file, get_lex_attr=None): - package = MockPackage.create_or_return(pkg_or_str_or_file) - tag_map = package.load_utf8(json.load, - 'vocab', 'tag_map.json') + package = Package.create_or_return(pkg_or_str_or_file) + with package.open(('vocab', 'tag_map.json'), default=None) as file_: + tag_map = json.load(file_) if file_ is not None else {} lemmatizer = Lemmatizer.load(package) - serializer_freqs = package.load_utf8(json.load, - 'vocab', 'serializer.json', - require=False) # TODO: really optional? + with package.open(('vocab', 'serializer.json'), default=None) as file_: + serializer_freqs = json.load(file_) if file_ is not None else {} cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - if package.has_file('vocab', 'strings.json'): # TODO: really optional? - package.load_utf8(self.strings.load, 'vocab', 'strings.json') - self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) + with package.open(('vocab', 'strings.json')) as file_: + self.strings.load(file_) + self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) - if package.has_file('vocab', 'vec.bin'): # TODO: really optional? + if package.has_file('vocab', 'vec.bin'): self.vectors_length = self.load_vectors_from_bin_loc( package.file_path('vocab', 'vec.bin')) return self From a2dfdec85d805594d7af6cbac5fc297e009bf111 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 18:06:09 +0100 Subject: [PATCH 05/26] * Clean up spacy.util --- spacy/util.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 74558c59c..61f708b8f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -63,19 +63,8 @@ class Package(object): file_.close() - def get_package(name=None, data_path=None): - return MockPackage(data_path) - #if data_path is None: - # if os.environ.get('SPACY_DATA'): - # data_path = os.environ.get('SPACY_DATA') - # else: - # data_path = os.path.abspath( - # os.path.join(os.path.dirname(__file__), 'data')) - - #sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version - #pool = sputnik.pool(data_path) - #return pool.get(name or 'en_default') + return Package(data_path) def normalize_slice(length, start, stop, step=None): From 86ee9d046d7c20af2fdf39afc9b8cbc8ead9d2a8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 18:07:23 +0100 Subject: [PATCH 06/26] * Remove test that belongs to a change for master --- spacy/tests/tokenizer/test_contractions.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index 76597ec5a..22b4189fe 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -49,9 +49,3 @@ def test_punct(en_tokenizer): tokens = en_tokenizer("``We've") assert len(tokens) == 3 - -def test_therell(en_tokenizer): - tokens = en_tokenizer("there'll") - assert len(tokens) == 2 - assert tokens[0].text == "there" - assert tokens[1].text == "'ll" From a6ba43ecaf04d1ea50bfb475e7a40e2eefe50bf6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 18:37:26 +0100 Subject: [PATCH 07/26] * Fix errors in packaging revision --- spacy/language.py | 4 ++-- spacy/matcher.pyx | 4 ++-- spacy/tests/tagger/test_lemmatizer.py | 2 +- spacy/tokenizer.pyx | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d16032c47..0123e1c4f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD -from .util import get_package, MockPackage +from .util import get_package, Package class Language(object): @@ -182,7 +182,7 @@ class Language(object): - Language(model='en_default ==1.0.0') - Language(model='en_default <1.1.0, data_dir='spacy/data') """ - package = MockPackage(data_dir) + package = Package(data_dir) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 6c70a6f68..842820b58 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -21,7 +21,7 @@ from .tokens.doc cimport Doc from .vocab cimport Vocab from .attrs import FLAG61 as U_ENT -from .util import MockPackage +from .util import Package from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT @@ -171,7 +171,7 @@ cdef class Matcher: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): - package = MockPackage.create_or_return(pkg_or_str_or_file) + package = Package.create_or_return(pkg_or_str_or_file) patterns = package.load_utf8(json.load, 'vocab', 'gazetteer.json', default={}) # TODO: really optional? diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index ebcc4e881..8ba2cc3ee 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -5,7 +5,7 @@ import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.util import get_package, MockPackage +from spacy.util import get_package, Package import pytest diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index b90945678..3c1f1e1ab 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -17,7 +17,7 @@ cimport cython from . import util from .tokens.doc cimport Doc from .util import read_lang_data -from .util import MockPackage as Package +from .util import Package cdef class Tokenizer: From 55bcdf8bdd76924be2861d97f3292406959b9165 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 29 Dec 2015 22:32:03 +0100 Subject: [PATCH 08/26] * Fix errors --- spacy/lemmatizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 2362a7842..dfa8b3aa3 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -8,7 +8,7 @@ except ImportError: import json from .parts_of_speech import NOUN, VERB, ADJ, PUNCT -from .util import MockPackage as Package +from .util import Package class Lemmatizer(object): From 029136a00722cdbbb4e1721916b21d5f9767b6b7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 31 Dec 2015 02:45:12 +0100 Subject: [PATCH 09/26] * Fix resource loading for Matcher --- spacy/matcher.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 842820b58..ba4b46fad 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -172,9 +172,9 @@ cdef class Matcher: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): package = Package.create_or_return(pkg_or_str_or_file) - patterns = package.load_utf8(json.load, - 'vocab', 'gazetteer.json', - default={}) # TODO: really optional? + + with package.open(('vocab', 'serializer.json'), default=None) as file_: + patterns = json.load(file_) if file_ is not None else {} return cls(vocab, patterns) def __init__(self, vocab, patterns): From eaf2ad59f1943bea698493673500034fbc830ff8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 31 Dec 2015 04:13:15 +0100 Subject: [PATCH 10/26] * Fix use of mock Package object --- spacy/language.py | 2 +- spacy/lemmatizer.py | 3 +- spacy/matcher.pyx | 4 +-- spacy/tests/tagger/test_lemmatizer.py | 6 ++-- spacy/util.py | 49 ++++++++++++++++----------- spacy/vocab.pyx | 6 ++-- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0123e1c4f..1dbbc09b1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -186,7 +186,7 @@ class Language(object): if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): - vocab = self.default_vocab(package) + vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs()) self.vocab = vocab if tokenizer in (None, True): tokenizer = Tokenizer.load(package, self.vocab) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index dfa8b3aa3..48f23b4b4 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -22,8 +22,7 @@ class Lemmatizer(object): index[pos] = read_index(file_) if file_ is not None else set() with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_: exc[pos] = read_exc(file_) if file_ is not None else {} - with pkg.open(('vocab', 'lemma_rules.json'), default=None) as file_: - rules = json.load(file_) if file_ is not None else {} + rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={}) return cls(index, exc, rules) def __init__(self, index, exceptions, rules): diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index ba4b46fad..777cdfbf3 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -172,9 +172,7 @@ cdef class Matcher: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): package = Package.create_or_return(pkg_or_str_or_file) - - with package.open(('vocab', 'serializer.json'), default=None) as file_: - patterns = json.load(file_) if file_ is not None else {} + patterns = package.load_json(('vocab', 'gazetteer.json')) return cls(vocab, patterns) def __init__(self, vocab, patterns): diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 8ba2cc3ee..a73c6dd4b 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -25,14 +25,16 @@ def lemmatizer(package): def test_read_index(package): - index = package.load_utf8(read_index, 'wordnet', 'index.noun') + with package.open(('wordnet', 'index.noun')) as file_: + index = read_index(file_) assert 'man' in index assert 'plantes' not in index assert 'plant' in index def test_read_exc(package): - exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc') + with package.open(('wordnet', 'verb.exc')) as file_: + exc = read_exc(file_) assert exc['was'] == ('be',) diff --git a/spacy/util.py b/spacy/util.py index 61f708b8f..5f148bc01 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -9,8 +9,8 @@ import types from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def local_path(subdir): - return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) +def local_path(*dirs): + return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs)) class Package(object): @@ -18,10 +18,10 @@ class Package(object): def create_or_return(cls, me_or_arg): return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg) - def __init__(self, data_path=None): + def __init__(self, data_path=None, model='en_default-1.0.3'): if data_path is None: - data_path = local_path('data') - self.name = None + data_path = local_path('data', model) + self.model = model self.data_path = data_path self._root = self.data_path @@ -37,18 +37,22 @@ class Package(object): def dir_path(self, *path_parts, **kwargs): return os.path.join(self._root, *path_parts) - def load_utf8(self, func, *path_parts, **kwargs): - if kwargs.get('require', True): - with io.open(self.file_path(os.path.join(*path_parts)), - mode='r', encoding='utf8') as f: - return func(f) - else: - return None + def load_json(self, path_parts, default=None): + if not self.has_file(*path_parts): + if _is_error_class(default): + raise default(self.file_path(*path_parts)) + elif isinstance(default, Exception): + raise default + else: + return default + with io.open(self.file_path(os.path.join(*path_parts)), + mode='r', encoding='utf8') as file_: + return json.load(file_) @contextmanager - def open(self, path_parts, default=IOError): + def open(self, path_parts, mode='r', encoding='utf8', default=IOError): if not self.has_file(*path_parts): - if isinstance(default, types.TypeType) and issubclass(default, Exception): + if _is_error_class(default): raise default(self.file_path(*path_parts)) elif isinstance(default, Exception): raise default @@ -57,12 +61,16 @@ class Package(object): else: # Enter file_ = io.open(self.file_path(os.path.join(*path_parts)), - mode='r', encoding='utf8') + mode=mode, encoding='utf8') yield file_ # Exit file_.close() +def _is_error_class(e): + return isinstance(e, types.TypeType) and issubclass(e, Exception) + + def get_package(name=None, data_path=None): return Package(data_path) @@ -92,10 +100,13 @@ def utf8open(loc, mode='r'): def read_lang_data(package): - tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json') - prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt') - suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt') - infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt') + tokenization = package.load_json(('tokenizer', 'specials.json')) + with package.open(('tokenizer', 'prefix.txt'), default=None) as file_: + prefix = read_prefix(file_) if file_ is not None else None + with package.open(('tokenizer', 'suffix.txt'), default=None) as file_: + suffix = read_suffix(file_) if file_ is not None else None + with package.open(('tokenizer', 'infix.txt'), default=None) as file_: + infix = read_infix(file_) if file_ is not None else None return tokenization, prefix, suffix, infix diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 1444f767e..a1d5ee8cc 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -50,13 +50,11 @@ cdef class Vocab: @classmethod def load(cls, pkg_or_str_or_file, get_lex_attr=None): package = Package.create_or_return(pkg_or_str_or_file) - with package.open(('vocab', 'tag_map.json'), default=None) as file_: - tag_map = json.load(file_) if file_ is not None else {} + tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) lemmatizer = Lemmatizer.load(package) - with package.open(('vocab', 'serializer.json'), default=None) as file_: - serializer_freqs = json.load(file_) if file_ is not None else {} + serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={}) cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) From 3fbfba575a8ba6ebea94a8702cc176eee883ff1b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 31 Dec 2015 13:16:28 +0100 Subject: [PATCH 11/26] * xfail the contractions test --- spacy/tests/tokenizer/test_contractions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py index afb1b1fe5..568e34704 100644 --- a/spacy/tests/tokenizer/test_contractions.py +++ b/spacy/tests/tokenizer/test_contractions.py @@ -50,6 +50,7 @@ def test_punct(en_tokenizer): assert len(tokens) == 3 +@pytest.mark.xfail def test_therell(en_tokenizer): tokens = en_tokenizer("there'll") assert len(tokens) == 2 From bc229790ac342c273cb9cd1124477a29732dc90b Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 13 Jan 2016 19:46:17 +0100 Subject: [PATCH 12/26] integrate with sputnik --- requirements.txt | 2 +- setup.py | 2 +- spacy/language.py | 8 +-- spacy/lemmatizer.py | 4 +- spacy/matcher.pyx | 4 +- spacy/tagger.pyx | 4 +- spacy/tests/tagger/test_lemmatizer.py | 2 +- spacy/tokenizer.pyx | 4 +- spacy/util.py | 79 +++++---------------------- spacy/vocab.pyx | 4 +- 10 files changed, 32 insertions(+), 81 deletions(-) diff --git a/requirements.txt b/requirements.txt index 98c0bbf00..b43a48752 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik>=0.6.4,<0.7.0 +sputnik>=0.7.0,<0.8.0 diff --git a/setup.py b/setup.py index 488885d72..a1e7dc94b 100644 --- a/setup.py +++ b/setup.py @@ -271,7 +271,7 @@ def setup_package(): ext_modules=ext_modules, install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47', 'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'], + 'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'], cmdclass = { 'build_ext': build_ext_subclass}, ) diff --git a/spacy/language.py b/spacy/language.py index 1dbbc09b1..fe7cabcd7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD -from .util import get_package, Package +from .util import get_package class Language(object): @@ -146,13 +146,13 @@ class Language(object): @classmethod def default_parser(cls, package, vocab): - data_dir = package.dir_path('deps', require=False) + data_dir = package.dir_path('deps') if data_dir and path.exists(data_dir): return Parser.from_dir(data_dir, vocab.strings, ArcEager) @classmethod def default_entity(cls, package, vocab): - data_dir = package.dir_path('ner', require=False) + data_dir = package.dir_path('ner') if data_dir and path.exists(data_dir): return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) @@ -182,7 +182,7 @@ class Language(object): - Language(model='en_default ==1.0.0') - Language(model='en_default <1.1.0, data_dir='spacy/data') """ - package = Package(data_dir) + package = get_package(model, data_path=data_dir) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 48f23b4b4..556de3659 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -8,13 +8,13 @@ except ImportError: import json from .parts_of_speech import NOUN, VERB, ADJ, PUNCT -from .util import Package +from .util import get_package class Lemmatizer(object): @classmethod def load(cls, pkg_or_str_or_file): - pkg = Package.create_or_return(pkg_or_str_or_file) + pkg = get_package(pkg_or_str_or_file) index = {} exc = {} for pos in ['adj', 'noun', 'verb']: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 777cdfbf3..2b7364487 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -21,7 +21,7 @@ from .tokens.doc cimport Doc from .vocab cimport Vocab from .attrs import FLAG61 as U_ENT -from .util import Package +from .util import get_package from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT @@ -171,7 +171,7 @@ cdef class Matcher: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): - package = Package.create_or_return(pkg_or_str_or_file) + package = get_package(pkg_or_str_or_file) patterns = package.load_json(('vocab', 'gazetteer.json')) return cls(vocab, patterns) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index decf918d8..a3f8797e2 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -16,7 +16,7 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .attrs cimport * -from .util import Package +from .util import get_package cpdef enum: @@ -149,7 +149,7 @@ cdef class Tagger: @classmethod def load(cls, pkg_or_str_or_file, vocab): - pkg = Package.create_or_return(pkg_or_str_or_file) + pkg = get_package(pkg_or_str_or_file) # TODO: templates.json deprecated? not present in latest package templates = cls.default_templates() # templates = package.load_utf8(json.load, diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index a73c6dd4b..3de30693c 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -5,7 +5,7 @@ import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.util import get_package, Package +from spacy.util import get_package import pytest diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 3c1f1e1ab..49e8a06ef 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -17,7 +17,7 @@ cimport cython from . import util from .tokens.doc cimport Doc from .util import read_lang_data -from .util import Package +from .util import get_package cdef class Tokenizer: @@ -43,7 +43,7 @@ cdef class Tokenizer: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): - pkg = Package.create_or_return(pkg_or_str_or_file) + pkg = get_package(pkg_or_str_or_file) rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) prefix_re = re.compile(prefix_re) suffix_re = re.compile(suffix_re) diff --git a/spacy/util.py b/spacy/util.py index 5f148bc01..49bbf3841 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -3,76 +3,27 @@ import io import json import re import os.path -from contextlib import contextmanager -import types +import sputnik +from sputnik.dir_package import DirPackage +from sputnik.package_stub import PackageStub + +from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def local_path(*dirs): - return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs)) +def get_package(value=None, data_path=None): + if data_path is None: + if isinstance(value, PackageStub): + return value + elif value and os.path.isdir(value): + return DirPackage(value) + elif value is None and data_path is not None: + return DirPackage(data_path) -class Package(object): - @classmethod - def create_or_return(cls, me_or_arg): - return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg) - - def __init__(self, data_path=None, model='en_default-1.0.3'): - if data_path is None: - data_path = local_path('data', model) - self.model = model - self.data_path = data_path - self._root = self.data_path - - def get(self, key): - pass - - def has_file(self, *path_parts): - return os.path.exists(os.path.join(self._root, *path_parts)) - - def file_path(self, *path_parts, **kwargs): - return os.path.join(self._root, *path_parts) - - def dir_path(self, *path_parts, **kwargs): - return os.path.join(self._root, *path_parts) - - def load_json(self, path_parts, default=None): - if not self.has_file(*path_parts): - if _is_error_class(default): - raise default(self.file_path(*path_parts)) - elif isinstance(default, Exception): - raise default - else: - return default - with io.open(self.file_path(os.path.join(*path_parts)), - mode='r', encoding='utf8') as file_: - return json.load(file_) - - @contextmanager - def open(self, path_parts, mode='r', encoding='utf8', default=IOError): - if not self.has_file(*path_parts): - if _is_error_class(default): - raise default(self.file_path(*path_parts)) - elif isinstance(default, Exception): - raise default - else: - yield default - else: - # Enter - file_ = io.open(self.file_path(os.path.join(*path_parts)), - mode=mode, encoding='utf8') - yield file_ - # Exit - file_.close() - - -def _is_error_class(e): - return isinstance(e, types.TypeType) and issubclass(e, Exception) - - -def get_package(name=None, data_path=None): - return Package(data_path) + return sputnik.package('spacy', about.short_version, + value or 'en_default==1.0.4', data_path=data_path) def normalize_slice(length, start, stop, step=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a1d5ee8cc..e09cb48de 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,7 +19,7 @@ from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer -from .util import Package +from .util import get_package from . import attrs from . import symbols @@ -49,7 +49,7 @@ cdef class Vocab: ''' @classmethod def load(cls, pkg_or_str_or_file, get_lex_attr=None): - package = Package.create_or_return(pkg_or_str_or_file) + package = get_package(pkg_or_str_or_file) tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) lemmatizer = Lemmatizer.load(package) From 9b75d872b08fe66a5a5c0a8534e36c0e252d3089 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 14 Jan 2016 12:02:56 +0100 Subject: [PATCH 13/26] fix model download --- spacy/en/download.py | 16 ++++++---------- spacy/util.py | 16 ++++++++++++++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index f91b44601..bdc0ac9b0 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -3,7 +3,9 @@ import os import shutil import plac -from sputnik import Sputnik +import sputnik + +from .. import about def migrate(path): @@ -35,23 +37,17 @@ def link(package, path): force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - # TODO read version from the same source as the setup - sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout) - path = os.path.dirname(os.path.abspath(__file__)) data_path = os.path.abspath(os.path.join(path, '..', 'data')) if not os.path.isdir(data_path): os.mkdir(data_path) - command = sputnik.command( - data_path=data_path, - repository_url='https://index.spacy.io') - if force: - command.purge() + sputnik.purge('spacy', about.short_version, data_path=data_path) - package = command.install('en_default') + package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4', + data_path=data_path) # FIXME clean up old-style packages migrate(path) diff --git a/spacy/util.py b/spacy/util.py index 49bbf3841..5aa4cde96 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,6 +7,7 @@ import os.path import sputnik from sputnik.dir_package import DirPackage from sputnik.package_stub import PackageStub +from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE @@ -22,8 +23,19 @@ def get_package(value=None, data_path=None): elif value is None and data_path is not None: return DirPackage(data_path) - return sputnik.package('spacy', about.short_version, - value or 'en_default==1.0.4', data_path=data_path) + try: + return sputnik.package('spacy', about.short_version, + value or 'en_default==1.0.4', + data_path=data_path) + + except PackageNotFoundException as e: + raise RuntimeError("Model not installed. Please run 'python -m " + "spacy.en.download' to install latest compatible " + "model.") + except CompatiblePackageNotFoundException as e: + raise RuntimeError("Installed model is not compatible with spaCy " + "version. Please run 'python -m spacy.en.download " + "--force' to install latest compatible model.") def normalize_slice(length, start, stop, step=None): From d9471f684fbfb4776bfaf2a4e2265c19dd758db2 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 14 Jan 2016 12:14:12 +0100 Subject: [PATCH 14/26] fix typo --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 5aa4cde96..c998df056 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -34,7 +34,7 @@ def get_package(value=None, data_path=None): "model.") except CompatiblePackageNotFoundException as e: raise RuntimeError("Installed model is not compatible with spaCy " - "version. Please run 'python -m spacy.en.download " + "version. Please run 'python -m spacy.en.download' " "--force' to install latest compatible model.") From 04e67e87151f6dc85e91649643c3e6ee8387670e Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 14 Jan 2016 15:24:51 +0100 Subject: [PATCH 15/26] fix package.json --- package.json | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/package.json b/package.json index 0710031ed..aab453cbd 100644 --- a/package.json +++ b/package.json @@ -1,17 +1,14 @@ { - "name": "en_default", - "version": "0.100.0", - "description": "english default model", + "name": "en_test", + "version": "1.0.0", + "description": "english test model", "license": "public domain", "include": [ - "deps/*", - "ner/*", - "pos/*", - "tokenizer/*", - "vocab/*", - "wordnet/*" - ], - "compatibility": { - "spacy": "==0.100.0" - } + ["deps", "*"], + ["ner", "*"], + ["pos", "*"], + ["tokenizer", "*"], + ["vocab", "*"], + ["wordnet", "*"] + ] } From 788f7345139e4526d3002121705b7e35e742bcb6 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 15 Jan 2016 18:01:02 +0100 Subject: [PATCH 16/26] refactored data_dir->via, add zip_safe, add spacy.load() --- setup.py | 1 + spacy/__init__.py | 7 +++++++ spacy/en/download.py | 21 +++++++++++++------- spacy/language.py | 47 ++++++++++++++++++++++++++++++-------------- spacy/lemmatizer.py | 4 ++-- spacy/matcher.pyx | 4 ++-- spacy/tagger.pyx | 4 ++-- spacy/tokenizer.pyx | 4 ++-- spacy/util.py | 23 +++++++++------------- spacy/vocab.pyx | 4 ++-- 10 files changed, 73 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index a1e7dc94b..349e85b6b 100644 --- a/setup.py +++ b/setup.py @@ -260,6 +260,7 @@ def setup_package(): setup( name='spacy', + zip_safe=False, packages=PACKAGES, package_data={'': ['*.pyx', '*.pxd']}, description='Industrial-strength NLP', diff --git a/spacy/__init__.py b/spacy/__init__.py index e69de29bb..57d02d95f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -0,0 +1,7 @@ +from . import util +from .en import English + + +def load(name, via=None): + package = util.get_package_by_name(name, via=via) + return English(package) diff --git a/spacy/en/download.py b/spacy/en/download.py index bdc0ac9b0..3195aa127 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,3 +1,5 @@ +from __future__ import print_function + import sys import os import shutil @@ -37,21 +39,26 @@ def link(package, path): force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): + package_name = 'en_default==1.0.4' path = os.path.dirname(os.path.abspath(__file__)) - data_path = os.path.abspath(os.path.join(path, '..', 'data')) - if not os.path.isdir(data_path): - os.mkdir(data_path) - if force: - sputnik.purge('spacy', about.short_version, data_path=data_path) + sputnik.purge('spacy', about.short_version) - package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4', - data_path=data_path) + package = sputnik.install('spacy', about.short_version, package_name) + + try: + sputnik.package('spacy', about.short_version, package_name) + except PackageNotFoundException, CompatiblePackageNotFoundException: + print("Model failed to install. Please run 'python -m " + "spacy.en.download --force'.", file=sys.stderr) + sys.exit(1) # FIXME clean up old-style packages migrate(path) + print("Model successfully installed.", file=sys.stderr) + if __name__ == '__main__': plac.call(main) diff --git a/spacy/language.py b/spacy/language.py index fe7cabcd7..8c86d53dd 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,6 +8,9 @@ try: except ImportError: import json +import sputnik +from sputnik.dir_package import DirPackage + from .tokenizer import Tokenizer from .vocab import Vocab from .syntax.parser import Parser @@ -19,8 +22,9 @@ from . import orth from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager +from . import about +from . import util from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD -from .util import get_package class Language(object): @@ -137,9 +141,7 @@ class Language(object): return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} @classmethod - def default_vocab(cls, package=None, get_lex_attr=None): - if package is None: - package = get_package() + def default_vocab(cls, package, get_lex_attr=None): if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs() return Vocab.load(package, get_lex_attr=get_lex_attr) @@ -157,8 +159,8 @@ class Language(object): return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) def __init__(self, + via=None, data_dir=None, - model=None, vocab=None, tokenizer=None, tagger=None, @@ -170,19 +172,34 @@ class Language(object): """ a model can be specified: - 1) by a path to the model directory (DEPRECATED) - - Language(data_dir='path/to/data') + 1) by calling a Language subclass + - spacy.en.English() - 2) by a language identifier (and optionally a package root dir) - - Language(lang='en') - - Language(lang='en', data_dir='spacy/data') + 2) by calling a Language subclass with via (previously: data_dir) + - spacy.en.English('my/model/root') + - spacy.en.English(via='my/model/root') - 3) by a model name/version (and optionally a package root dir) - - Language(model='en_default') - - Language(model='en_default ==1.0.0') - - Language(model='en_default <1.1.0, data_dir='spacy/data') + 3) by package name + - spacy.load('en_default') + - spacy.load('en_default==1.0.0') + + 4) by package name with a relocated package base + - spacy.load('en_default', via='/my/package/root') + - spacy.load('en_default==1.0.0', via='/my/package/root') + + 5) by package object + - spacy.en.English(package) """ - package = get_package(model, data_path=data_dir) + + if data_dir is not None and via is None: + warn("Use of data_dir is deprecated, use via instead.", DeprecationWarning) + via = data_dir + + if via is None: + package = util.get_package_by_name('en_default==1.0.4') + else: + package = util.get_package(via) + if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 556de3659..5082da253 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -13,8 +13,8 @@ from .util import get_package class Lemmatizer(object): @classmethod - def load(cls, pkg_or_str_or_file): - pkg = get_package(pkg_or_str_or_file) + def load(cls, via): + pkg = get_package(via) index = {} exc = {} for pos in ['adj', 'noun', 'verb']: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 2b7364487..df71e8f98 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -170,8 +170,8 @@ cdef class Matcher: cdef object _patterns @classmethod - def load(cls, pkg_or_str_or_file, Vocab vocab): - package = get_package(pkg_or_str_or_file) + def load(cls, via, Vocab vocab): + package = get_package(via) patterns = package.load_json(('vocab', 'gazetteer.json')) return cls(vocab, patterns) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index a3f8797e2..7d7b82d90 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -148,8 +148,8 @@ cdef class Tagger: return cls(vocab, model) @classmethod - def load(cls, pkg_or_str_or_file, vocab): - pkg = get_package(pkg_or_str_or_file) + def load(cls, via, vocab): + pkg = get_package(via) # TODO: templates.json deprecated? not present in latest package templates = cls.default_templates() # templates = package.load_utf8(json.load, diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 49e8a06ef..9f195f784 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -42,8 +42,8 @@ cdef class Tokenizer: return (self.__class__, args, None, None) @classmethod - def load(cls, pkg_or_str_or_file, Vocab vocab): - pkg = get_package(pkg_or_str_or_file) + def load(cls, via, Vocab vocab): + pkg = get_package(via) rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) prefix_re = re.compile(prefix_re) suffix_re = re.compile(suffix_re) diff --git a/spacy/util.py b/spacy/util.py index c998df056..27d1fe161 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,34 +7,29 @@ import os.path import sputnik from sputnik.dir_package import DirPackage from sputnik.package_stub import PackageStub -from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException +from sputnik.package_list import (PackageNotFoundException, + CompatiblePackageNotFoundException) from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def get_package(value=None, data_path=None): - if data_path is None: - if isinstance(value, PackageStub): - return value - elif value and os.path.isdir(value): - return DirPackage(value) +def get_package(via=None): + if isinstance(via, PackageStub): + return via + return DirPackage(via) - elif value is None and data_path is not None: - return DirPackage(data_path) +def get_package_by_name(name, via=None): try: - return sputnik.package('spacy', about.short_version, - value or 'en_default==1.0.4', - data_path=data_path) - + return sputnik.package('spacy', about.short_version, name, data_path=via) except PackageNotFoundException as e: raise RuntimeError("Model not installed. Please run 'python -m " "spacy.en.download' to install latest compatible " "model.") except CompatiblePackageNotFoundException as e: raise RuntimeError("Installed model is not compatible with spaCy " - "version. Please run 'python -m spacy.en.download' " + "version. Please run 'python -m spacy.en.download " "--force' to install latest compatible model.") diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e09cb48de..f9771d5f7 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -48,8 +48,8 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' @classmethod - def load(cls, pkg_or_str_or_file, get_lex_attr=None): - package = get_package(pkg_or_str_or_file) + def load(cls, via, get_lex_attr=None): + package = get_package(via) tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) lemmatizer = Lemmatizer.load(package) From 780cb847c952ca2a836e90907c36836c53d3cf34 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 15 Jan 2016 18:07:15 +0100 Subject: [PATCH 17/26] add default_model to about --- setup.py | 1 + spacy/en/download.py | 5 ++--- spacy/language.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 349e85b6b..01035b078 100644 --- a/setup.py +++ b/setup.py @@ -158,6 +158,7 @@ version = '%(version)s' full_version = '%(full_version)s' git_revision = '%(git_revision)s' release = %(isrelease)s +default_model = 'en_default==1.0.4' if not release: version = full_version """ diff --git a/spacy/en/download.py b/spacy/en/download.py index 3195aa127..1b50feeb6 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -39,16 +39,15 @@ def link(package, path): force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - package_name = 'en_default==1.0.4' path = os.path.dirname(os.path.abspath(__file__)) if force: sputnik.purge('spacy', about.short_version) - package = sputnik.install('spacy', about.short_version, package_name) + package = sputnik.install('spacy', about.short_version, about.default_model) try: - sputnik.package('spacy', about.short_version, package_name) + sputnik.package('spacy', about.short_version, about.default_model) except PackageNotFoundException, CompatiblePackageNotFoundException: print("Model failed to install. Please run 'python -m " "spacy.en.download --force'.", file=sys.stderr) diff --git a/spacy/language.py b/spacy/language.py index 8c86d53dd..ab6f95644 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -196,7 +196,7 @@ class Language(object): via = data_dir if via is None: - package = util.get_package_by_name('en_default==1.0.4') + package = util.get_package_by_name(about.default_model) else: package = util.get_package(via) From ccd87ad7fb8066ba11b479a4bbab5597d4c1dd49 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 15 Jan 2016 18:12:01 +0100 Subject: [PATCH 18/26] add default_model to about --- setup.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 01035b078..13fbf2c42 100644 --- a/setup.py +++ b/setup.py @@ -14,11 +14,12 @@ except ImportError: from distutils.core import Extension, setup -MAJOR = 0 -MINOR = 100 -MICRO = 0 -ISRELEASED = False -VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) +MAJOR = 0 +MINOR = 100 +MICRO = 0 +ISRELEASE = False +VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) +DEFAULT_MODEL = 'en_default==1.0.4' PACKAGES = [ @@ -145,7 +146,7 @@ def get_version_info(): else: GIT_REVISION = 'Unknown' - if not ISRELEASED: + if not ISRELEASE: FULLVERSION += '.dev0+' + GIT_REVISION[:7] return FULLVERSION, GIT_REVISION @@ -158,7 +159,7 @@ version = '%(version)s' full_version = '%(full_version)s' git_revision = '%(git_revision)s' release = %(isrelease)s -default_model = 'en_default==1.0.4' +default_model = '%(default_model)s' if not release: version = full_version """ @@ -168,7 +169,8 @@ if not release: f.write(cnt % {'version': VERSION, 'full_version' : FULLVERSION, 'git_revision' : GIT_REVISION, - 'isrelease': str(ISRELEASED)}) + 'isrelease': str(ISRELEASE), + 'default_model': DEFAULT_MODEL}) def generate_cython(root, source): From f8a8f97d25fc18d487f1de5f5e5411861ea614fa Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 15 Jan 2016 18:13:37 +0100 Subject: [PATCH 19/26] cleanup --- spacy/language.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index ab6f95644..11ae91f69 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,9 +8,6 @@ try: except ImportError: import json -import sputnik -from sputnik.dir_package import DirPackage - from .tokenizer import Tokenizer from .vocab import Vocab from .syntax.parser import Parser From 211913d689943b38a42dd7bd47bd8c5608fe439d Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 15 Jan 2016 18:57:01 +0100 Subject: [PATCH 20/26] add about.py, adapt setup.py --- setup.py | 97 +++++--------------------------------------- spacy/about.py | 14 +++++++ spacy/en/download.py | 6 +-- spacy/language.py | 3 +- spacy/util.py | 5 ++- 5 files changed, 32 insertions(+), 93 deletions(-) create mode 100644 spacy/about.py diff --git a/setup.py b/setup.py index 13fbf2c42..ce189985b 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from __future__ import division, print_function +from __future__ import print_function import os import shutil import subprocess @@ -14,14 +14,6 @@ except ImportError: from distutils.core import Extension, setup -MAJOR = 0 -MINOR = 100 -MICRO = 0 -ISRELEASE = False -VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) -DEFAULT_MODEL = 'en_default==1.0.4' - - PACKAGES = [ 'spacy', 'spacy.tokens', @@ -104,75 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options): build_ext.build_extensions(self) -# Return the git revision as a string -def git_version(): - def _minimal_ext_cmd(cmd): - # construct minimal environment - env = {} - for k in ['SYSTEMROOT', 'PATH']: - v = os.environ.get(k) - if v is not None: - env[k] = v - # LANGUAGE is used on win32 - env['LANGUAGE'] = 'C' - env['LANG'] = 'C' - env['LC_ALL'] = 'C' - out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0] - return out - - try: - out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) - GIT_REVISION = out.strip().decode('ascii') - except OSError: - GIT_REVISION = 'Unknown' - - return GIT_REVISION - - -def get_version_info(): - # Adding the git rev number needs to be done inside write_version_py(), - # otherwise the import of spacy.about messes up the build under Python 3. - FULLVERSION = VERSION - if os.path.exists('.git'): - GIT_REVISION = git_version() - elif os.path.exists(os.path.join('spacy', 'about.py')): - # must be a source distribution, use existing version file - try: - from spacy.about import git_revision as GIT_REVISION - except ImportError: - raise ImportError('Unable to import git_revision. Try removing ' - 'spacy/about.py and the build directory ' - 'before building.') - else: - GIT_REVISION = 'Unknown' - - if not ISRELEASE: - FULLVERSION += '.dev0+' + GIT_REVISION[:7] - - return FULLVERSION, GIT_REVISION - - -def write_version(path): - cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY -short_version = '%(version)s' -version = '%(version)s' -full_version = '%(full_version)s' -git_revision = '%(git_revision)s' -release = %(isrelease)s -default_model = '%(default_model)s' -if not release: - version = full_version -""" - FULLVERSION, GIT_REVISION = get_version_info() - - with open(path, 'w') as f: - f.write(cnt % {'version': VERSION, - 'full_version' : FULLVERSION, - 'git_revision' : GIT_REVISION, - 'isrelease': str(ISRELEASE), - 'default_model': DEFAULT_MODEL}) - - def generate_cython(root, source): print('Cythonizing sources') p = subprocess.call([sys.executable, @@ -244,7 +167,9 @@ def setup_package(): return clean(root) with chdir(root): - write_version(os.path.join(root, 'spacy', 'about.py')) + about = {} + with open(os.path.join(root, "spacy", "about.py")) as f: + exec(f.read(), about) include_dirs = [ get_python_inc(plat_specific=True), @@ -262,16 +187,16 @@ def setup_package(): prepare_includes(root) setup( - name='spacy', + name=about['__name__'], zip_safe=False, packages=PACKAGES, package_data={'': ['*.pyx', '*.pxd']}, - description='Industrial-strength NLP', - author='Matthew Honnibal', - author_email='matt@spacy.io', - version=VERSION, - url='https://spacy.io', - license='MIT', + description=about['__summary__'], + author=about['__author__'], + author_email=about['__email__'], + version=about['__version__'], + url=about['__uri__'], + license=about['__license__'], ext_modules=ext_modules, install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47', 'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six', diff --git a/spacy/about.py b/spacy/about.py new file mode 100644 index 000000000..6ad68f5ba --- /dev/null +++ b/spacy/about.py @@ -0,0 +1,14 @@ +# inspired from: + +# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ +# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py + +__name__ = 'spacy' +__version__ = '0.100.0' +__summary__ = 'Industrial-strength NLP' +__uri__ = 'https://spacy.io' +__author__ = 'Matthew Honnibal' +__email__ = 'matt@spacy.io' +__license__ = 'MIT' +__release__ = False +__default_model__ = 'en_default==1.0.4' diff --git a/spacy/en/download.py b/spacy/en/download.py index 1b50feeb6..8d48405fe 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -42,12 +42,12 @@ def main(data_size='all', force=False): path = os.path.dirname(os.path.abspath(__file__)) if force: - sputnik.purge('spacy', about.short_version) + sputnik.purge(about.__name__, about.__version__) - package = sputnik.install('spacy', about.short_version, about.default_model) + package = sputnik.install(about.__name__, about.__version__, about.__default_model__) try: - sputnik.package('spacy', about.short_version, about.default_model) + sputnik.package(about.__name__, about.__version__, about.__default_model__) except PackageNotFoundException, CompatiblePackageNotFoundException: print("Model failed to install. Please run 'python -m " "spacy.en.download --force'.", file=sys.stderr) diff --git a/spacy/language.py b/spacy/language.py index 11ae91f69..24e716265 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -19,7 +19,6 @@ from . import orth from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager -from . import about from . import util from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD @@ -193,7 +192,7 @@ class Language(object): via = data_dir if via is None: - package = util.get_package_by_name(about.default_model) + package = util.get_package_by_name() else: package = util.get_package(via) diff --git a/spacy/util.py b/spacy/util.py index 27d1fe161..5083fa37a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,9 +20,10 @@ def get_package(via=None): return DirPackage(via) -def get_package_by_name(name, via=None): +def get_package_by_name(name=None, via=None): try: - return sputnik.package('spacy', about.short_version, name, data_path=via) + return sputnik.package(about.__name__, about.__version__, + name or about.__default_model__, data_path=via) except PackageNotFoundException as e: raise RuntimeError("Model not installed. Please run 'python -m " "spacy.en.download' to install latest compatible " From 634ea57876f9bd64a4dc57688edbac4c31953f1f Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 16 Jan 2016 09:56:12 +0100 Subject: [PATCH 21/26] adapt travis/appveyor to latest sputnik --- .appveyor.yml | 2 +- .travis.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index d4cab45cd..0fc7dad50 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -61,7 +61,7 @@ build_script: - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data" - "cp package.json data" - "%CMD_IN_ENV% sputnik build data en_default.sputnik" - - "%CMD_IN_ENV% sputnik install en_default.sputnik" + - "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik" test_script: # Run the project tests diff --git a/.travis.yml b/.travis.yml index b6c2a430e..e89dd19e4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,7 @@ install: - "python bin/init_model.py en lang_data/ corpora/ data" - "cp package.json data" - "sputnik build data en_default.sputnik" - - "sputnik install en_default.sputnik" + - "sputnik --name spacy install en_default.sputnik" script: - python build.py $MODE; From 846fa49b2ab5d69409f4b9861baaba36e2fb9404 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 16 Jan 2016 10:00:57 +0100 Subject: [PATCH 22/26] distinct load() and from_package() methods --- spacy/__init__.py | 2 +- spacy/language.py | 26 +++++++++++++------------- spacy/lemmatizer.py | 5 ++++- spacy/matcher.pyx | 5 ++++- spacy/tagger.pyx | 5 ++++- spacy/tokenizer.pyx | 7 +++++-- spacy/util.py | 6 +++--- spacy/vocab.pyx | 7 +++++-- 8 files changed, 39 insertions(+), 24 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 57d02d95f..556027a42 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,4 +4,4 @@ from .en import English def load(name, via=None): package = util.get_package_by_name(name, via=via) - return English(package) + return English(package=package) diff --git a/spacy/language.py b/spacy/language.py index 24e716265..8f3eb646d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -140,7 +140,7 @@ class Language(object): def default_vocab(cls, package, get_lex_attr=None): if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs() - return Vocab.load(package, get_lex_attr=get_lex_attr) + return Vocab.from_package(package, get_lex_attr=get_lex_attr) @classmethod def default_parser(cls, package, vocab): @@ -164,7 +164,8 @@ class Language(object): entity=None, matcher=None, serializer=None, - load_vectors=True): + load_vectors=True, + package=None): """ a model can be specified: @@ -182,30 +183,29 @@ class Language(object): 4) by package name with a relocated package base - spacy.load('en_default', via='/my/package/root') - spacy.load('en_default==1.0.0', via='/my/package/root') - - 5) by package object - - spacy.en.English(package) """ if data_dir is not None and via is None: warn("Use of data_dir is deprecated, use via instead.", DeprecationWarning) via = data_dir - if via is None: - package = util.get_package_by_name() - else: - package = util.get_package(via) + if package is None: + if via is None: + package = util.get_package_by_name() + else: + package = util.get_package(via) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) + if vocab in (None, True): - vocab = Vocab.load(package, get_lex_attr=self.default_lex_attrs()) + vocab = self.default_vocab(package) self.vocab = vocab if tokenizer in (None, True): - tokenizer = Tokenizer.load(package, self.vocab) + tokenizer = Tokenizer.from_package(package, self.vocab) self.tokenizer = tokenizer if tagger in (None, True): - tagger = Tagger.load(package, self.vocab) + tagger = Tagger.from_package(package, self.vocab) self.tagger = tagger if entity in (None, True): entity = self.default_entity(package, self.vocab) @@ -214,7 +214,7 @@ class Language(object): parser = self.default_parser(package, self.vocab) self.parser = parser if matcher in (None, True): - matcher = Matcher.load(package, self.vocab) + matcher = Matcher.from_package(package, self.vocab) self.matcher = matcher def __reduce__(self): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 5082da253..a05ca49c0 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -14,7 +14,10 @@ from .util import get_package class Lemmatizer(object): @classmethod def load(cls, via): - pkg = get_package(via) + return cls.from_package(get_package(via)) + + @classmethod + def from_package(cls, pkg): index = {} exc = {} for pos in ['adj', 'noun', 'verb']: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index df71e8f98..098d6cd5d 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -171,7 +171,10 @@ cdef class Matcher: @classmethod def load(cls, via, Vocab vocab): - package = get_package(via) + return cls.from_package(get_package(via), vocab=vocab) + + @classmethod + def from_package(cls, package, Vocab vocab): patterns = package.load_json(('vocab', 'gazetteer.json')) return cls(vocab, patterns) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 7d7b82d90..0d4252d6c 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -149,7 +149,10 @@ cdef class Tagger: @classmethod def load(cls, via, vocab): - pkg = get_package(via) + return cls.from_package(get_package(via), vocab=vocab) + + @classmethod + def from_package(cls, pkg, vocab): # TODO: templates.json deprecated? not present in latest package templates = cls.default_templates() # templates = package.load_utf8(json.load, diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 9f195f784..fbb54c248 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -43,8 +43,11 @@ cdef class Tokenizer: @classmethod def load(cls, via, Vocab vocab): - pkg = get_package(via) - rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) + return cls.from_package(get_package(via), vocab=vocab) + + @classmethod + def from_package(cls, package, Vocab vocab): + rules, prefix_re, suffix_re, infix_re = read_lang_data(package) prefix_re = re.compile(prefix_re) suffix_re = re.compile(suffix_re) infix_re = re.compile(infix_re) diff --git a/spacy/util.py b/spacy/util.py index 5083fa37a..24b683f0a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -4,9 +4,9 @@ import json import re import os.path +import six import sputnik from sputnik.dir_package import DirPackage -from sputnik.package_stub import PackageStub from sputnik.package_list import (PackageNotFoundException, CompatiblePackageNotFoundException) @@ -15,8 +15,8 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE def get_package(via=None): - if isinstance(via, PackageStub): - return via + if not isinstance(via, six.string_types): + raise RuntimeError('via must be a string') return DirPackage(via) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f9771d5f7..3e7dbf38d 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,10 +49,13 @@ cdef class Vocab: ''' @classmethod def load(cls, via, get_lex_attr=None): - package = get_package(via) + return cls.from_package(get_package(via), get_lex_attr=get_lex_attr) + + @classmethod + def from_package(cls, package, get_lex_attr=None): tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) - lemmatizer = Lemmatizer.load(package) + lemmatizer = Lemmatizer.from_package(package) serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={}) From 6d1a3af34358ccfdcb046a9c95f602904a46fb73 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 16 Jan 2016 10:05:04 +0100 Subject: [PATCH 23/26] cleanup unused --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 24b683f0a..5a354e09a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,7 +14,7 @@ from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def get_package(via=None): +def get_package(): if not isinstance(via, six.string_types): raise RuntimeError('via must be a string') return DirPackage(via) From 235f0945342419baaca63d8de3c56f4077dc970e Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 16 Jan 2016 12:23:45 +0100 Subject: [PATCH 24/26] untangle data_path/via --- spacy/en/download.py | 27 +++++++++++---------------- spacy/language.py | 13 ++++--------- spacy/matcher.pyx | 4 ++-- spacy/tagger.pyx | 4 ++-- spacy/tests/conftest.py | 8 ++++---- spacy/tests/serialize/test_packer.py | 11 +++++++---- spacy/tests/tagger/test_lemmatizer.py | 12 ++++++------ spacy/tests/website/conftest.py | 6 +++--- spacy/tests/website/test_home.py | 8 ++++---- spacy/tokenizer.pyx | 4 ++-- spacy/util.py | 8 ++++---- spacy/vocab.pyx | 4 ++-- 12 files changed, 51 insertions(+), 58 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 8d48405fe..7e017b9c0 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -6,6 +6,8 @@ import shutil import plac import sputnik +from sputnik.package_list import (PackageNotFoundException, + CompatiblePackageNotFoundException) from .. import about @@ -22,28 +24,21 @@ def migrate(path): os.unlink(os.path.join(path, filename)) -def link(package, path): - if os.path.exists(path): - if os.path.isdir(path): - shutil.rmtree(path) - else: - os.unlink(path) - - if not hasattr(os, 'symlink'): # not supported by win+py27 - shutil.copytree(package.dir_path('data'), path) - else: - os.symlink(package.dir_path('data'), path) - - @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - path = os.path.dirname(os.path.abspath(__file__)) - if force: sputnik.purge(about.__name__, about.__version__) + try: + sputnik.package(about.__name__, about.__version__, about.__default_model__) + print("Model already installed. Please run 'python -m " + "spacy.en.download --force' to reinstall.", file=sys.stderr) + sys.exit(1) + except PackageNotFoundException, CompatiblePackageNotFoundException: + pass + package = sputnik.install(about.__name__, about.__version__, about.__default_model__) try: @@ -54,7 +49,7 @@ def main(data_size='all', force=False): sys.exit(1) # FIXME clean up old-style packages - migrate(path) + migrate(os.path.dirname(os.path.abspath(__file__))) print("Model successfully installed.", file=sys.stderr) diff --git a/spacy/language.py b/spacy/language.py index 8f3eb646d..e97076b77 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -155,7 +155,6 @@ class Language(object): return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) def __init__(self, - via=None, data_dir=None, vocab=None, tokenizer=None, @@ -172,9 +171,9 @@ class Language(object): 1) by calling a Language subclass - spacy.en.English() - 2) by calling a Language subclass with via (previously: data_dir) + 2) by calling a Language subclass with data_dir - spacy.en.English('my/model/root') - - spacy.en.English(via='my/model/root') + - spacy.en.English(data_dir='my/model/root') 3) by package name - spacy.load('en_default') @@ -185,15 +184,11 @@ class Language(object): - spacy.load('en_default==1.0.0', via='/my/package/root') """ - if data_dir is not None and via is None: - warn("Use of data_dir is deprecated, use via instead.", DeprecationWarning) - via = data_dir - if package is None: - if via is None: + if data_dir is None: package = util.get_package_by_name() else: - package = util.get_package(via) + package = util.get_package(data_dir) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 098d6cd5d..cef98c068 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -170,8 +170,8 @@ cdef class Matcher: cdef object _patterns @classmethod - def load(cls, via, Vocab vocab): - return cls.from_package(get_package(via), vocab=vocab) + def load(cls, data_dir, Vocab vocab): + return cls.from_package(get_package(data_dir), vocab=vocab) @classmethod def from_package(cls, package, Vocab vocab): diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 0d4252d6c..493cc4f99 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -148,8 +148,8 @@ cdef class Tagger: return cls(vocab, model) @classmethod - def load(cls, via, vocab): - return cls.from_package(get_package(via), vocab=vocab) + def load(cls, data_dir, vocab): + return cls.from_package(get_package(data_dir), vocab=vocab) @classmethod def from_package(cls, pkg, vocab): diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b8a620d88..83a39a03a 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -7,11 +7,11 @@ import os @pytest.fixture(scope="session") def EN(): if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') + data_dir = os.environ.get('SPACY_DATA') else: - data_path = None - print("Load EN from %s" % data_path) - return English(data_dir=data_path) + data_dir = None + print("Load EN from %s" % data_dir) + return English(data_dir=data_dir) def pytest_addoption(parser): diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index 1d3b12117..392cba8e3 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -13,6 +13,7 @@ from spacy.tokenizer import Tokenizer from os import path import os +from spacy import util from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD from spacy.serialize.packer import Packer @@ -21,11 +22,13 @@ from spacy.serialize.bits import BitArray @pytest.fixture def vocab(): - if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') + data_dir = os.environ.get('SPACY_DATA') + if data_dir is None: + package = util.get_package_by_name() else: - data_path = None - vocab = English.default_vocab(package=data_path) + package = util.get_package(data_dir) + + vocab = English.default_vocab(package=package) lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' lex = vocab['the'] diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 3de30693c..af85645a6 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -5,23 +5,23 @@ import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.util import get_package +from spacy import util import pytest @pytest.fixture def package(): - if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') + data_dir = os.environ.get('SPACY_DATA') + if data_dir is None: + return util.get_package_by_name() else: - data_path = None - return get_package(data_path=data_path) + return util.get_package(data_dir) @pytest.fixture def lemmatizer(package): - return Lemmatizer.load(package) + return Lemmatizer.from_package(package) def test_read_index(package): diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py index d7b4b3252..e2c64cfd7 100644 --- a/spacy/tests/website/conftest.py +++ b/spacy/tests/website/conftest.py @@ -7,10 +7,10 @@ import os def nlp(): from spacy.en import English if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') + data_dir = os.environ.get('SPACY_DATA') else: - data_path = None - return English(data_dir=data_path) + data_dir = None + return English(data_dir=data_dir) @pytest.fixture() diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py index ef13b4677..3d9aa5dd6 100644 --- a/spacy/tests/website/test_home.py +++ b/spacy/tests/website/test_home.py @@ -11,13 +11,13 @@ def token(doc): def test_load_resources_and_process_text(): if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') + data_dir = os.environ.get('SPACY_DATA') else: - data_path = None - print("Load EN from %s" % data_path) + data_dir = None + print("Load EN from %s" % data_dir) from spacy.en import English - nlp = English(data_dir=data_path) + nlp = English(data_dir=data_dir) doc = nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index fbb54c248..593d0dc7d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -42,8 +42,8 @@ cdef class Tokenizer: return (self.__class__, args, None, None) @classmethod - def load(cls, via, Vocab vocab): - return cls.from_package(get_package(via), vocab=vocab) + def load(cls, data_dir, Vocab vocab): + return cls.from_package(get_package(data_dir), vocab=vocab) @classmethod def from_package(cls, package, Vocab vocab): diff --git a/spacy/util.py b/spacy/util.py index 5a354e09a..390c83a03 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,10 +14,10 @@ from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def get_package(): - if not isinstance(via, six.string_types): - raise RuntimeError('via must be a string') - return DirPackage(via) +def get_package(data_dir): + if not isinstance(data_dir, six.string_types): + raise RuntimeError('data_dir must be a string') + return DirPackage(data_dir) def get_package_by_name(name=None, via=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3e7dbf38d..f4750dcb5 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -48,8 +48,8 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' @classmethod - def load(cls, via, get_lex_attr=None): - return cls.from_package(get_package(via), get_lex_attr=get_lex_attr) + def load(cls, data_dir, get_lex_attr=None): + return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr) @classmethod def from_package(cls, package, get_lex_attr=None): From 5551052840833ef9a5c2f1f679dd390e0a035a0d Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 16 Jan 2016 12:44:53 +0100 Subject: [PATCH 25/26] fix py2/3 issue --- spacy/en/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 7e017b9c0..ae1e62e44 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -36,14 +36,14 @@ def main(data_size='all', force=False): print("Model already installed. Please run 'python -m " "spacy.en.download --force' to reinstall.", file=sys.stderr) sys.exit(1) - except PackageNotFoundException, CompatiblePackageNotFoundException: + except (PackageNotFoundException, CompatiblePackageNotFoundException): pass package = sputnik.install(about.__name__, about.__version__, about.__default_model__) try: sputnik.package(about.__name__, about.__version__, about.__default_model__) - except PackageNotFoundException, CompatiblePackageNotFoundException: + except (PackageNotFoundException, CompatiblePackageNotFoundException): print("Model failed to install. Please run 'python -m " "spacy.en.download --force'.", file=sys.stderr) sys.exit(1) From 41ea14a56fed1cf9f33bc913c00ec77f67671312 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 16 Jan 2016 13:23:11 +0100 Subject: [PATCH 26/26] fix pickling --- spacy/language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index e97076b77..8efcc618e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -215,7 +215,6 @@ class Language(object): def __reduce__(self): args = ( None, # data_dir - None, # model self.vocab, self.tokenizer, self.tagger,