From 8359bd4d93f7b9d1bf5b76adb45aa36bfbfbb499 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Fri, 18 Dec 2015 09:52:55 +0100
Subject: [PATCH] strip data/ from package, friendlier Language invocation, make
 data_dir backward/forward-compatible

---
 spacy/en/__init__.py                  |  4 ++
 spacy/language.py                     | 94 +++++++++++++++++++++------
 spacy/lemmatizer.py                   |  6 +-
 spacy/matcher.pyx                     |  2 +-
 spacy/tagger.pyx                      | 11 ++--
 spacy/tests/tagger/test_lemmatizer.py |  4 +-
 spacy/util.py                         | 28 ++++----
 spacy/vocab.pyx                       | 14 ++--
 8 files changed, 112 insertions(+), 51 deletions(-)

diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 309deae41..17af520df 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -33,6 +33,10 @@ your yours yourself yourselves
 STOPWORDS = set(w for w in STOPWORDS.split() if w)
 
 class English(Language):
+    def __init__(self, **kwargs):
+        kwargs['lang'] = 'en'
+        super(English, self).__init__(**kwargs)
+
     @staticmethod
     def is_stop(string):
         return 1 if string.lower() in STOPWORDS else 0
diff --git a/spacy/language.py b/spacy/language.py
index 83b91cdca..7a96e12ea 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import default_package
+from .util import get_package
 
 class Language(object):
@@ -137,48 +137,100 @@ class Language(object):
     @classmethod
     def default_vocab(cls, package=None, get_lex_attr=None):
         if package is None:
-            package = default_package()
+            package = get_package()
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
         return Vocab.from_package(package, get_lex_attr=get_lex_attr)
 
     @classmethod
     def default_parser(cls, package, vocab):
-        data_dir = package.dir_path('data', 'deps', require=False)
+        data_dir = package.dir_path('deps', require=False)
         if data_dir and path.exists(data_dir):
             return Parser.from_dir(data_dir, vocab.strings, ArcEager)
 
     @classmethod
     def default_entity(cls, package, vocab):
-        data_dir = package.dir_path('data', 'ner', require=False)
+        data_dir = package.dir_path('ner', require=False)
         if data_dir and path.exists(data_dir):
             return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
 
-    def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None,
-                 parser=None, entity=None, matcher=None, serializer=None,
-                 load_vectors=True):
+    def __init__(self, **kwargs):
+        """
+        A model can be specified in one of three ways:
+
+        1) by a path to the model directory (DEPRECATED)
+           - Language(data_dir='path/to/data')
+
+        2) by a language identifier (and optionally a package root dir)
+           - Language(lang='en')
+           - Language(lang='en', data_dir='spacy/data')
+
+        3) by a model name/version (and optionally a package root dir)
+           - Language(model='en_default')
+           - Language(model='en_default', version='1.0.0')
+           - Language(model='en_default', version='1.0.0', data_dir='spacy/data')
+        """
+        data_dir = kwargs.pop('data_dir', None)
+
+        lang = kwargs.pop('lang', None)
+        model = kwargs.pop('model', None)
+        version = kwargs.pop('version', None)
+
+        vocab = kwargs.pop('vocab', None)
+        tokenizer = kwargs.pop('tokenizer', None)
+        tagger = kwargs.pop('tagger', None)
+        parser = kwargs.pop('parser', None)
+        entity = kwargs.pop('entity', None)
+        matcher = kwargs.pop('matcher', None)
+        serializer = kwargs.pop('serializer', None)
+
+        load_vectors = kwargs.pop('load_vectors', True)
+
+        # Backward compatibility: wrap plain (non-package) data dirs in a
+        # minimal shim that exposes the package interface.
+        if data_dir and path.exists(path.join(data_dir, 'vocab')):
+            class Package(object):
+                def __init__(self, root):
+                    self.root = root
+
+                def has_file(self, *path_parts):
+                    return path.exists(path.join(self.root, *path_parts))
+
+                def file_path(self, *path_parts, **kwargs):
+                    return path.join(self.root, *path_parts)
+
+                def dir_path(self, *path_parts, **kwargs):
+                    return path.join(self.root, *path_parts)
+
+                def load_utf8(self, func, *path_parts, **kwargs):
+                    with io.open(self.file_path(*path_parts),
+                                 mode='r', encoding='utf8') as f:
+                        return func(f)
+
+            warn("using non-package data_dir", DeprecationWarning)
+            package = Package(data_dir)
+        else:
+            if model is None:
+                model = '%s_default' % (lang or 'en')
+                version = None
+            package = get_package(name=model, version=version,
+                                  data_path=data_dir)
+
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
-        if package in (None, True):
-            package = default_package()
         if vocab in (None, True):
             vocab = self.default_vocab(package)
+        self.vocab = vocab
         if tokenizer in (None, True):
-            tokenizer = Tokenizer.from_package(package, vocab)
+            tokenizer = Tokenizer.from_package(package, self.vocab)
+        self.tokenizer = tokenizer
         if tagger in (None, True):
-            tagger = Tagger.from_package(package, vocab)
+            tagger = Tagger.from_package(package, self.vocab)
+        self.tagger = tagger
         if entity in (None, True):
-            entity = self.default_entity(package, vocab)
+            entity = self.default_entity(package, self.vocab)
+        self.entity = entity
         if parser in (None, True):
-            parser = self.default_parser(package, vocab)
+            parser = self.default_parser(package, self.vocab)
+        self.parser = parser
         if matcher in (None, True):
-            matcher = Matcher.from_package(package, vocab)
-        self.vocab = vocab
-        self.tokenizer = tokenizer
-        self.tagger = tagger
-        self.parser = parser
-        self.entity = entity
-        self.matcher = matcher
+            matcher = Matcher.from_package(package, self.vocab)
+        self.matcher = matcher
 
     def __reduce__(self):
         return (self.__class__,
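The docstring above lists the three supported invocation styles. For reference, a minimal usage sketch, assuming the en_default model is installed; the paths and the version string are placeholders:

    from spacy.en import English

    # 1) DEPRECATED: point directly at an unpacked model directory
    nlp = English(data_dir='path/to/data')

    # 2) by language identifier; English() fills in lang='en',
    #    which resolves to the 'en_default' model
    nlp = English()

    # 3) by model name, optionally pinned to a version
    nlp = English(model='en_default', version='1.0.0')

Explicitly passed components (vocab, tokenizer, tagger, parser, entity, matcher) are still honoured; only the ones left as None are built from the resolved package.
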
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index d7fdcf76b..c5b9c1c50 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -17,14 +17,14 @@ class Lemmatizer(object):
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
             index[pos] = package.load_utf8(read_index,
-                'data', 'wordnet', 'index.%s' % pos,
+                'wordnet', 'index.%s' % pos,
                 default=set())  # TODO: really optional?
             exc[pos] = package.load_utf8(read_exc,
-                'data', 'wordnet', '%s.exc' % pos,
+                'wordnet', '%s.exc' % pos,
                 default={})  # TODO: really optional?
         rules = package.load_utf8(json.load,
-            'data', 'vocab', 'lemma_rules.json',
+            'vocab', 'lemma_rules.json',
             default={})  # TODO: really optional?
         return cls(index, exc, rules)
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 4319d593b..4d36b7742 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -171,7 +171,7 @@ cdef class Matcher:
     @classmethod
     def from_package(cls, package, Vocab vocab):
         patterns = package.load_utf8(json.load,
-            'data', 'vocab', 'gazetteer.json',
+            'vocab', 'gazetteer.json',
             default={})  # TODO: really optional?
         return cls(vocab, patterns)
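Both hunks above drop the 'data' prefix: lemmatizer and matcher resources are now addressed relative to the package root ('wordnet/...', 'vocab/...'). These call sites rely on the package object's load_utf8 contract: open the resource as UTF-8, apply the reader callable, and fall back to `default` when the file is missing. The real implementation is provided by sputnik's package object; the standalone sketch below only illustrates that assumed contract:

    import io
    import json
    from os import path

    def load_utf8(root, func, *path_parts, **kwargs):
        # Resolve the resource relative to the package root.
        loc = path.join(root, *path_parts)
        if not path.exists(loc):
            # Optional resources fall back to a caller-supplied default.
            if 'default' in kwargs:
                return kwargs['default']
            raise IOError('missing resource: %s' % loc)
        with io.open(loc, mode='r', encoding='utf8') as f:
            return func(f)

    # e.g. the matcher's gazetteer, optional with an empty default
    patterns = load_utf8('spacy/data/en_default', json.load,
                         'vocab', 'gazetteer.json', default={})
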
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 1c345c6e8..2c05b4a84 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -148,15 +148,16 @@ cdef class Tagger:
 
     @classmethod
     def from_package(cls, package, vocab):
-        templates = package.load_utf8(json.load,
-            'data', 'pos', 'templates.json',
-            default=cls.default_templates())
+        # TODO: templates.json deprecated? not present in latest package
+        templates = cls.default_templates()
+        # templates = package.load_utf8(json.load,
+        #     'pos', 'templates.json',
+        #     default=cls.default_templates())
 
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if package.has_file('data', 'pos', 'model'):  # TODO: really optional?
-            model.load(package.file_path('data', 'pos', 'model'))
+        if package.has_file('pos', 'model'):  # TODO: really optional?
+            model.load(package.file_path('pos', 'model'))
 
         return cls(vocab, model)
diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index 708594299..6950f010f 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -20,14 +20,14 @@ def lemmatizer(package):
 
 
 def test_read_index(package):
-    index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun')
+    index = package.load_utf8(read_index, 'wordnet', 'index.noun')
     assert 'man' in index
     assert 'plantes' not in index
     assert 'plant' in index
 
 
 def test_read_exc(package):
-    exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc')
+    exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
     assert exc['was'] == ('be',)
diff --git a/spacy/util.py b/spacy/util.py
index 5592e64eb..69e3ba237 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -8,16 +8,20 @@
 from sputnik import Sputnik
 
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 
-def default_package():
-    if os.environ.get('SPACY_DATA'):
-        data_path = os.environ.get('SPACY_DATA')
-    else:
-        data_path = os.path.abspath(
-            os.path.join(os.path.dirname(__file__), 'data'))
+def get_package(name=None, version=None, data_path=None):
+    if name is None:
+        name = 'en_default'  # keep the old default_package() behaviour
+    if data_path is None:
+        if os.environ.get('SPACY_DATA'):
+            data_path = os.environ.get('SPACY_DATA')
+        else:
+            data_path = os.path.abspath(
+                os.path.join(os.path.dirname(__file__), 'data'))
 
-    sputnik = Sputnik('spacy', '0.99.0')  # TODO: retrieve version
+    sputnik = Sputnik('spacy', '0.100.0')  # TODO: retrieve version
     pool = sputnik.pool(data_path)
-    return pool.get('en_default')
+
+    if version:
+        name += ' ==%s' % version  # sputnik version specifier syntax
+    return pool.get(name)
 
 
 def normalize_slice(length, start, stop, step=None):
@@ -45,10 +49,10 @@ def utf8open(loc, mode='r'):
 
 
 def read_lang_data(package):
-    tokenization = package.load_utf8(json.load, 'data', 'tokenizer', 'specials.json')
-    prefix = package.load_utf8(read_prefix, 'data', 'tokenizer', 'prefix.txt')
-    suffix = package.load_utf8(read_suffix, 'data', 'tokenizer', 'suffix.txt')
-    infix = package.load_utf8(read_infix, 'data', 'tokenizer', 'infix.txt')
+    tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
+    prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
+    suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
+    infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
     return tokenization, prefix, suffix, infix
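With get_package, model lookup is controlled by three inputs, in order of precedence: an explicit data_path argument, the SPACY_DATA environment variable, and the data/ directory shipped next to the spacy package. Version pinning expands to a sputnik specifier such as 'en_default ==1.0.0'. A short sketch of the resulting call patterns, with placeholder paths:

    from spacy.util import get_package

    # default data path, or $SPACY_DATA if set
    package = get_package(name='en_default')

    # pin the model to a specific version
    package = get_package(name='en_default', version='1.0.0')

    # resolve inside an explicit package root instead
    package = get_package(name='en_default', data_path='spacy/data')
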
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ac083d9bc..bb0ae6173 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -49,24 +49,24 @@ cdef class Vocab:
     @classmethod
     def from_package(cls, package, get_lex_attr=None):
         tag_map = package.load_utf8(json.load,
-            'data', 'vocab', 'tag_map.json')
+            'vocab', 'tag_map.json')
 
         lemmatizer = Lemmatizer.from_package(package)
 
         serializer_freqs = package.load_utf8(json.load,
-            'data', 'vocab', 'serializer.json',
+            'vocab', 'serializer.json',
             require=False)  # TODO: really optional?
 
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer,
                               serializer_freqs=serializer_freqs)
 
-        if package.has_file('data', 'vocab', 'strings.json'):  # TODO: really optional?
-            package.load_utf8(self.strings.load, 'data', 'vocab', 'strings.json')
-            self.load_lexemes(package.file_path('data', 'vocab', 'lexemes.bin'))
+        if package.has_file('vocab', 'strings.json'):  # TODO: really optional?
+            package.load_utf8(self.strings.load, 'vocab', 'strings.json')
+            self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
 
-        if package.has_file('data', 'vocab', 'vec.bin'):  # TODO: really optional?
+        if package.has_file('vocab', 'vec.bin'):  # TODO: really optional?
             self.vectors_length = self.load_vectors_from_bin_loc(
-                package.file_path('data', 'vocab', 'vec.bin'))
+                package.file_path('vocab', 'vec.bin'))
 
         return self
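After this patch a model package keeps its resources directly under the package root (tokenizer/, vocab/, pos/, deps/, ner/, wordnet/) rather than under data/. As a closing sketch, loading a Vocab on its own from an installed en_default package; note that vectors_length is only meaningful when vocab/vec.bin is present:

    from spacy.util import get_package
    from spacy.vocab import Vocab

    package = get_package(name='en_default')
    vocab = Vocab.from_package(package)
    print(vocab.vectors_length)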