From 931c07a609d169ae7c3de0be2a4d94d6c7e9a7ac Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 4 Mar 2016 11:09:06 +0100 Subject: [PATCH 1/4] initial proposal for separate vector package --- spacy/__init__.py | 5 +++-- spacy/language.py | 10 ++++++---- spacy/vocab.pyx | 7 +++++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 556027a42..191d5970c 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -2,6 +2,7 @@ from . import util from .en import English -def load(name, via=None): +def load(name, via=None, vectors_name=None): package = util.get_package_by_name(name, via=via) - return English(package=package) + vectors_package = util.get_package_by_name(vectors_name, via=via) + return English(package=package, vectors_package=vectors_package) diff --git a/spacy/language.py b/spacy/language.py index ae8aa4560..157f7d040 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -153,7 +153,7 @@ class Language(object): return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} @classmethod - def default_vocab(cls, package, get_lex_attr=None): + def default_vocab(cls, package, get_lex_attr=None, vectors_package=None): if get_lex_attr is None: if package.has_file('vocab', 'oov_prob'): with package.open(('vocab', 'oov_prob')) as file_: @@ -162,7 +162,8 @@ class Language(object): else: get_lex_attr = cls.default_lex_attrs() if hasattr(package, 'dir_path'): - return Vocab.from_package(package, get_lex_attr=get_lex_attr) + return Vocab.from_package(package, get_lex_attr=get_lex_attr, + vectors_package=vectors_package) else: return Vocab.load(package, get_lex_attr) @@ -198,7 +199,8 @@ class Language(object): matcher=None, serializer=None, load_vectors=True, - package=None): + package=None, + vectors_package=None): """ a model can be specified: @@ -228,7 +230,7 @@ class Language(object): warn("load_vectors is deprecated", DeprecationWarning) if vocab in (None, True): - vocab = self.default_vocab(package) + vocab = self.default_vocab(package, vectors_package=vectors_package) self.vocab = vocab if tokenizer in (None, True): tokenizer = Tokenizer.from_package(package, self.vocab) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a0a07f305..de4909f30 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -52,7 +52,7 @@ cdef class Vocab: return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr) @classmethod - def from_package(cls, package, get_lex_attr=None): + def from_package(cls, package, get_lex_attr=None, vectors_package=None): tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) lemmatizer = Lemmatizer.from_package(package) @@ -66,7 +66,10 @@ cdef class Vocab: self.strings.load(file_) self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) - if package.has_file('vocab', 'vec.bin'): + if vectors_package and vectors_package.has_file('vocab', 'vec.bin'): + self.vectors_length = self.load_vectors_from_bin_loc( + vectors_package.file_path('vocab', 'vec.bin')) + elif package.has_file('vocab', 'vec.bin'): self.vectors_length = self.load_vectors_from_bin_loc( package.file_path('vocab', 'vec.bin')) return self From aa4d964c1431b4ff2e633e1e55940733089c1a88 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 5 Mar 2016 17:51:32 +0100 Subject: [PATCH 2/4] cleanup api --- spacy/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 191d5970c..ef9fb18cf 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -2,7 +2,7 @@ from . import util from .en import English -def load(name, via=None, vectors_name=None): - package = util.get_package_by_name(name, via=via) - vectors_package = util.get_package_by_name(vectors_name, via=via) - return English(package=package, vectors_package=vectors_package) +def load(name, vectors=None, via=None): + return English( + package=util.get_package_by_name(name, via=via), + vectors_package=util.get_package_by_name(vectors_name, via=via)) From eb7ae61b1c6cafadc55743cfc733c73c73c0fa3f Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 8 Mar 2016 12:59:18 +0100 Subject: [PATCH 3/4] cleanup api --- spacy/__init__.py | 2 +- spacy/util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index ef9fb18cf..70e72b7a1 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,4 +5,4 @@ from .en import English def load(name, vectors=None, via=None): return English( package=util.get_package_by_name(name, via=via), - vectors_package=util.get_package_by_name(vectors_name, via=via)) + vectors_package=util.get_package_by_name(vectors, via=via)) diff --git a/spacy/util.py b/spacy/util.py index 7a5a1aa0c..bcc55c656 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -25,9 +25,9 @@ def get_package_by_name(name=None, via=None): return sputnik.package(about.__title__, about.__version__, name or about.__default_model__, data_path=via) except PackageNotFoundException as e: - raise RuntimeError("Model not installed. Please run 'python -m " + raise RuntimeError("Model %s not installed. Please run 'python -m " "spacy.en.download' to install latest compatible " - "model.") + "model." % name) except CompatiblePackageNotFoundException as e: raise RuntimeError("Installed model is not compatible with spaCy " "version. Please run 'python -m spacy.en.download " From 5b3b3ebc8e33ef73f2a36d5bfcd8ac581e50c86f Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 8 Mar 2016 15:30:17 +0100 Subject: [PATCH 4/4] upgrade to latest sputnik --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8e0af3cc3..09dd74496 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,4 @@ plac six ujson cloudpickle -sputnik>=0.9.0,<0.10.0 +sputnik>=0.9.2,<0.10.0 diff --git a/setup.py b/setup.py index e646cc0f8..176434151 100644 --- a/setup.py +++ b/setup.py @@ -176,7 +176,7 @@ def setup_package(): ext_modules=ext_modules, install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.32.0', 'preshed>=0.46.1,<0.47', 'thinc>=5.0.0,<5.1.0', 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik>=0.9.0,<0.10.0'], + 'ujson', 'cloudpickle', 'sputnik>=0.9.2,<0.10.0'], cmdclass = { 'build_ext': build_ext_subclass}, )