From b8f63071eb1a8a1523ca91819485a350afd83c14 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:54:45 +0100 Subject: [PATCH] add lang registration facility --- spacy/__init__.py | 21 ++++++++++++--------- spacy/about.py | 14 +++----------- spacy/download.py | 6 +++--- spacy/tokenizer.pyx | 3 +-- spacy/util.py | 32 ++++++++++++++++++++++++-------- spacy/vocab.pyx | 1 - 6 files changed, 43 insertions(+), 34 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index b09ee3491..f47926a63 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,13 +1,16 @@ from . import util -from .about import __models__ -import importlib + +from .en import English +from .de import German +from . import util + + +util.register_lang(English.lang, English) +util.register_lang(German.lang, German) def load(name, vectors=None, via=None): - if name not in __models__: - raise Exception('Model %s not found.' % name) - - mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') - return getattr(mod, __models__[name]['class'])( - package=util.get_package_by_name(name, via=via), - vectors_package=util.get_package_by_name(vectors, via=via)) + package = util.get_package_by_name(name, via=via) + vectors_package = util.get_package_by_name(vectors, via=via) + cls = util.get_lang(name) + return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/about.py b/spacy/about.py index eed7c3f81..7f889cad8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -11,15 +11,7 @@ __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' __models__ = { - 'en': { - 'module': 'en', - 'class': 'English', - 'package': 'en>=1.0.0,<1.1.0', - }, - 'de': { - 'module': 'de', - 'class': 'German', - 'package': 'de>=1.0.0,<1.1.0', - }, + 'en': 'en>=1.0.0,<1.1.0', + 'de': 'de>=1.0.0,<1.1.0', } -__default_model__ = 'en' +__default_lang__ = 'en' diff --git a/spacy/download.py b/spacy/download.py index 537c06872..f7fc798ae 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -14,17 +14,17 @@ def download(lang, force=False): sputnik.purge(about.__title__, about.__version__) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) print("Model already installed. Please run 'python -m " "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) sys.exit(1) except (PackageNotFoundException, CompatiblePackageNotFoundException): pass - package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) except (PackageNotFoundException, CompatiblePackageNotFoundException): print("Model failed to install. Please run 'python -m " "spacy.%s.download --force'." % lang, file=sys.stderr) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f8613fce8..44d627505 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -16,8 +16,7 @@ cimport cython from . import util from .tokens.doc cimport Doc -from .util import read_lang_data -from .util import get_package +from .util import read_lang_data, get_package cdef class Tokenizer: diff --git a/spacy/util.py b/spacy/util.py index 37d3b7bab..4eda2d0e4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,21 @@ from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +LANGUAGES = {} + + +def register_lang(name, cls): + global LANGUAGES + LANGUAGES[name] = cls + + +def get_lang(name): + lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] + if lang not in LANGUAGES: + raise RuntimeError('Language not supported: %s' % lang) + return LANGUAGES[lang] + + def get_package(data_dir): if not isinstance(data_dir, six.string_types): raise RuntimeError('data_dir must be a string') @@ -21,19 +36,20 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): + package_name = name or about.__models__[about.__default_lang__] + lang = get_lang(package_name) try: return sputnik.package(about.__title__, about.__version__, - name or about.__models__[about.__default_model__]['package'], - data_path=via) + package_name, data_path=via) except PackageNotFoundException as e: - raise RuntimeError("Model %s not installed. Please run 'python -m " - "spacy.%s.download' to install latest compatible " - "model." % (name, about.__models__[name]['module'])) + raise RuntimeError("Model '%s' not installed. Please run 'python -m " + "%s.download' to install latest compatible " + "model." % (name, lang.__module__)) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model %s is not compatible with spaCy " - "version. Please run 'python -m spacy.%s.download " + raise RuntimeError("Installed model is not compatible with spaCy " + "version. Please run 'python -m %s.download " "--force' to install latest compatible model." % - (name, about.__models__[name]['module'])) + (lang.__module__)) def normalize_slice(length, start, stop, step=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f876bfefb..3712a7383 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -25,7 +25,6 @@ from . import attrs from . import symbols from cymem.cymem cimport Address -from . import util from .serialize.packer cimport Packer from .attrs cimport PROB