From bc229790ac342c273cb9cd1124477a29732dc90b Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 13 Jan 2016 19:46:17 +0100 Subject: [PATCH] integrate with sputnik --- requirements.txt | 2 +- setup.py | 2 +- spacy/language.py | 8 +-- spacy/lemmatizer.py | 4 +- spacy/matcher.pyx | 4 +- spacy/tagger.pyx | 4 +- spacy/tests/tagger/test_lemmatizer.py | 2 +- spacy/tokenizer.pyx | 4 +- spacy/util.py | 79 +++++---------------------- spacy/vocab.pyx | 4 +- 10 files changed, 32 insertions(+), 81 deletions(-) diff --git a/requirements.txt b/requirements.txt index 98c0bbf00..b43a48752 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik>=0.6.4,<0.7.0 +sputnik>=0.7.0,<0.8.0 diff --git a/setup.py b/setup.py index 488885d72..a1e7dc94b 100644 --- a/setup.py +++ b/setup.py @@ -271,7 +271,7 @@ def setup_package(): ext_modules=ext_modules, install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47', 'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'], + 'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'], cmdclass = { 'build_ext': build_ext_subclass}, ) diff --git a/spacy/language.py b/spacy/language.py index 1dbbc09b1..fe7cabcd7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD -from .util import get_package, Package +from .util import get_package class Language(object): @@ -146,13 +146,13 @@ class Language(object): @classmethod def default_parser(cls, package, vocab): - data_dir = package.dir_path('deps', require=False) + data_dir = package.dir_path('deps') if data_dir and path.exists(data_dir): return Parser.from_dir(data_dir, vocab.strings, ArcEager) @classmethod def default_entity(cls, package, vocab): - data_dir = package.dir_path('ner', require=False) + data_dir = package.dir_path('ner') if data_dir and path.exists(data_dir): return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) @@ -182,7 +182,7 @@ class Language(object): - Language(model='en_default ==1.0.0') - Language(model='en_default <1.1.0, data_dir='spacy/data') """ - package = Package(data_dir) + package = get_package(model, data_path=data_dir) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 48f23b4b4..556de3659 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -8,13 +8,13 @@ except ImportError: import json from .parts_of_speech import NOUN, VERB, ADJ, PUNCT -from .util import Package +from .util import get_package class Lemmatizer(object): @classmethod def load(cls, pkg_or_str_or_file): - pkg = Package.create_or_return(pkg_or_str_or_file) + pkg = get_package(pkg_or_str_or_file) index = {} exc = {} for pos in ['adj', 'noun', 'verb']: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 777cdfbf3..2b7364487 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -21,7 +21,7 @@ from .tokens.doc cimport Doc from .vocab cimport Vocab from .attrs import FLAG61 as U_ENT -from .util import Package +from .util import get_package from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT @@ -171,7 +171,7 @@ cdef class Matcher: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): - package = Package.create_or_return(pkg_or_str_or_file) + package = get_package(pkg_or_str_or_file) patterns = package.load_json(('vocab', 'gazetteer.json')) return cls(vocab, patterns) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index decf918d8..a3f8797e2 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -16,7 +16,7 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .attrs cimport * -from .util import Package +from .util import get_package cpdef enum: @@ -149,7 +149,7 @@ cdef class Tagger: @classmethod def load(cls, pkg_or_str_or_file, vocab): - pkg = Package.create_or_return(pkg_or_str_or_file) + pkg = get_package(pkg_or_str_or_file) # TODO: templates.json deprecated? not present in latest package templates = cls.default_templates() # templates = package.load_utf8(json.load, diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index a73c6dd4b..3de30693c 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -5,7 +5,7 @@ import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.util import get_package, Package +from spacy.util import get_package import pytest diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 3c1f1e1ab..49e8a06ef 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -17,7 +17,7 @@ cimport cython from . import util from .tokens.doc cimport Doc from .util import read_lang_data -from .util import Package +from .util import get_package cdef class Tokenizer: @@ -43,7 +43,7 @@ cdef class Tokenizer: @classmethod def load(cls, pkg_or_str_or_file, Vocab vocab): - pkg = Package.create_or_return(pkg_or_str_or_file) + pkg = get_package(pkg_or_str_or_file) rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg) prefix_re = re.compile(prefix_re) suffix_re = re.compile(suffix_re) diff --git a/spacy/util.py b/spacy/util.py index 5f148bc01..49bbf3841 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -3,76 +3,27 @@ import io import json import re import os.path -from contextlib import contextmanager -import types +import sputnik +from sputnik.dir_package import DirPackage +from sputnik.package_stub import PackageStub + +from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def local_path(*dirs): - return os.path.abspath(os.path.join(os.path.dirname(__file__), *dirs)) +def get_package(value=None, data_path=None): + if data_path is None: + if isinstance(value, PackageStub): + return value + elif value and os.path.isdir(value): + return DirPackage(value) + elif value is None and data_path is not None: + return DirPackage(data_path) -class Package(object): - @classmethod - def create_or_return(cls, me_or_arg): - return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg) - - def __init__(self, data_path=None, model='en_default-1.0.3'): - if data_path is None: - data_path = local_path('data', model) - self.model = model - self.data_path = data_path - self._root = self.data_path - - def get(self, key): - pass - - def has_file(self, *path_parts): - return os.path.exists(os.path.join(self._root, *path_parts)) - - def file_path(self, *path_parts, **kwargs): - return os.path.join(self._root, *path_parts) - - def dir_path(self, *path_parts, **kwargs): - return os.path.join(self._root, *path_parts) - - def load_json(self, path_parts, default=None): - if not self.has_file(*path_parts): - if _is_error_class(default): - raise default(self.file_path(*path_parts)) - elif isinstance(default, Exception): - raise default - else: - return default - with io.open(self.file_path(os.path.join(*path_parts)), - mode='r', encoding='utf8') as file_: - return json.load(file_) - - @contextmanager - def open(self, path_parts, mode='r', encoding='utf8', default=IOError): - if not self.has_file(*path_parts): - if _is_error_class(default): - raise default(self.file_path(*path_parts)) - elif isinstance(default, Exception): - raise default - else: - yield default - else: - # Enter - file_ = io.open(self.file_path(os.path.join(*path_parts)), - mode=mode, encoding='utf8') - yield file_ - # Exit - file_.close() - - -def _is_error_class(e): - return isinstance(e, types.TypeType) and issubclass(e, Exception) - - -def get_package(name=None, data_path=None): - return Package(data_path) + return sputnik.package('spacy', about.short_version, + value or 'en_default==1.0.4', data_path=data_path) def normalize_slice(length, start, stop, step=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a1d5ee8cc..e09cb48de 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,7 +19,7 @@ from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer -from .util import Package +from .util import get_package from . import attrs from . import symbols @@ -49,7 +49,7 @@ cdef class Vocab: ''' @classmethod def load(cls, pkg_or_str_or_file, get_lex_attr=None): - package = Package.create_or_return(pkg_or_str_or_file) + package = get_package(pkg_or_str_or_file) tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) lemmatizer = Lemmatizer.load(package)