mirror of https://github.com/explosion/spaCy.git
Merge load_lang_class and get_lang_class
This commit is contained in:
parent 36bebe7164
commit b462076d80
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -16,7 +16,7 @@ def load(name, **overrides):
     meta = util.parse_package_meta(model_path)
     if 'lang' not in meta:
         raise IOError('No language setting found in model meta.')
-    cls = util.load_lang_class(meta['lang'])
+    cls = util.get_lang_class(meta['lang'])
     overrides['meta'] = meta
     overrides['path'] = model_path
     return cls(**overrides)
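The hunk above is the only functional edit in load(): the language code from the model's meta.json is now resolved through the merged helper. A minimal sketch of that resolution step (the meta dict is a hypothetical stand-in for a real model's meta.json):

    from spacy import util

    meta = {'lang': 'en'}  # hypothetical; normally parsed from the model package
    if 'lang' not in meta:
        raise IOError('No language setting found in model meta.')
    # Resolves 'en' to the English Language class, importing it lazily.
    cls = util.get_lang_class(meta['lang'])
    # load() then instantiates it with the collected overrides: cls(**overrides)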
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -18,67 +18,67 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
 
 @pytest.fixture(params=_languages)
 def tokenizer(request):
-    lang = util.load_lang_class(request.param)
+    lang = util.get_lang_class(request.param)
     return lang.Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def en_tokenizer():
-    return util.load_lang_class('en').Defaults.create_tokenizer()
+    return util.get_lang_class('en').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def en_vocab():
-    return util.load_lang_class('en').Defaults.create_vocab()
+    return util.get_lang_class('en').Defaults.create_vocab()
 
 
 @pytest.fixture
 def en_parser():
-    return util.load_lang_class('en').Defaults.create_parser()
+    return util.get_lang_class('en').Defaults.create_parser()
 
 
 @pytest.fixture
 def es_tokenizer():
-    return util.load_lang_class('es').Defaults.create_tokenizer()
+    return util.get_lang_class('es').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def de_tokenizer():
-    return util.load_lang_class('de').Defaults.create_tokenizer()
+    return util.get_lang_class('de').Defaults.create_tokenizer()
 
 
 @pytest.fixture(scope='module')
 def fr_tokenizer():
-    return util.load_lang_class('fr').Defaults.create_tokenizer()
+    return util.get_lang_class('fr').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def hu_tokenizer():
-    return util.load_lang_class('hu').Defaults.create_tokenizer()
+    return util.get_lang_class('hu').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def fi_tokenizer():
-    return util.load_lang_class('fi').Defaults.create_tokenizer()
+    return util.get_lang_class('fi').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def sv_tokenizer():
-    return util.load_lang_class('sv').Defaults.create_tokenizer()
+    return util.get_lang_class('sv').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def bn_tokenizer():
-    return util.load_lang_class('bn').Defaults.create_tokenizer()
+    return util.get_lang_class('bn').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def he_tokenizer():
-    return util.load_lang_class('he').Defaults.create_tokenizer()
+    return util.get_lang_class('he').Defaults.create_tokenizer()
 
 @pytest.fixture
 def nb_tokenizer():
-    return util.load_lang_class('nb').Defaults.create_tokenizer()
+    return util.get_lang_class('nb').Defaults.create_tokenizer()
 
 
 @pytest.fixture
@@ -88,12 +88,12 @@ def stringstore():
 
 @pytest.fixture
 def en_entityrecognizer():
-    return util.load_lang_class('en').Defaults.create_entity()
+    return util.get_lang_class('en').Defaults.create_entity()
 
 
 @pytest.fixture
 def lemmatizer():
-    return util.load_lang_class('en').Defaults.create_lemmatizer()
+    return util.get_lang_class('en').Defaults.create_lemmatizer()
 
 
 @pytest.fixture
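All of the fixtures above follow the same pattern: resolve the Language class by its code, then build the component from the class's Defaults. pytest injects them by argument name, so a test consuming one might look like this (hypothetical test, not part of this commit):

    def test_en_tokenizer_splits_on_whitespace(en_tokenizer):
        tokens = en_tokenizer("Hello world")
        assert [token.text for token in tokens] == ["Hello", "world"]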
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -17,30 +17,30 @@ LANGUAGES = {}
 _data_path = Path(__file__).parent / 'data'
 
 
-def set_lang_class(name, cls):
+def get_lang_class(lang):
+    """Import and load a Language class.
+
+    lang (unicode): Two-letter language code, e.g. 'en'.
+    RETURNS (Language): Language class.
+    """
     global LANGUAGES
-    LANGUAGES[name] = cls
-
-
-def get_lang_class(name):
-    if name in LANGUAGES:
-        return LANGUAGES[name]
-    lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
-    if lang not in LANGUAGES:
-        raise RuntimeError('Language not supported: %s' % name)
+    if not lang in LANGUAGES:
+        try:
+            module = importlib.import_module('.lang.%s' % lang, 'spacy')
+        except ImportError:
+            raise ImportError("Can't import language %s from spacy.lang." % lang)
+        LANGUAGES[lang] = getattr(module, module.__all__[0])
     return LANGUAGES[lang]
 
 
-def load_lang_class(lang):
-    """Import and load a Language class.
+def set_lang_class(name, cls):
+    """Set a custom Language class name that can be loaded via get_lang_class.
 
-    Args:
-        lang (unicode): Two-letter language code, e.g. 'en'.
-    Returns:
-        Language: Language class.
+    name (unicode): Name of Language class.
+    cls (Language): Language class.
     """
-    module = importlib.import_module('.lang.%s' % lang, 'spacy')
-    return getattr(module, module.__all__[0])
+    global LANGUAGES
+    LANGUAGES[name] = cls
 
 
 def get_data_path(require_exists=True):
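This hunk is the core of the merge: get_lang_class() now owns both the lazy import (previously load_lang_class()) and the registry lookup, caching imported classes in the module-level LANGUAGES dict, while set_lang_class() only pre-registers a class. A short sketch of both paths, assuming the spacy.lang layout this commit targets (CustomEnglish is a hypothetical subclass):

    from spacy import util

    # First call imports spacy.lang.en and caches the class in util.LANGUAGES;
    # subsequent calls are plain dict lookups.
    English = util.get_lang_class('en')
    assert util.get_lang_class('en') is English

    # Pre-registering a custom class short-circuits the import for that code.
    class CustomEnglish(English):
        pass

    util.set_lang_class('en', CustomEnglish)
    assert util.get_lang_class('en') is CustomEnglish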
--- a/website/docs/api/util.jade
+++ b/website/docs/api/util.jade
@@ -49,7 +49,7 @@ p
         +cell unicode or #[code Path]
         +cell Path to new data directory.
 
-+h(2, "load_lang_class") load_lang_class
++h(2, "get_lang_class") get_lang_class
 +tag function
 
 p
@@ -59,7 +59,7 @@ p
 
 +aside-code("Example").
     for lang_id in ['en', 'de']:
-        lang_class = util.load_lang_class(lang_id)
+        lang_class = util.get_lang_class(lang_id)
         lang = lang_class()
         tokenizer = lang.Defaults.create_tokenizer()
 
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -80,7 +80,7 @@ p
     | compute. As of spaCy v2.0, #[code Language] classes are not imported on
     | initialisation and are only loaded when you import them directly, or load
     | a model that requires a language to be loaded. To lazy-load languages in
-    | your application, you can use the #[code util.load_lang_class()] helper
+    | your application, you can use the #[code util.get_lang_class()] helper
     | function with the two-letter language code as its argument.
 
 +h(2, "language-data") Adding language data
@@ -486,7 +486,7 @@ p
     | #[+src(gh("spaCy", "spacy/tests/lang")) tests/lang] in a directory named
     | after the language ID. You'll also need to create a fixture for your
     | tokenizer in the #[+src(gh("spaCy", "spacy/tests/conftest.py")) conftest.py].
-    | Always use the #[code load_lang_class()] helper function within the fixture,
+    | Always use the #[code get_lang_class()] helper function within the fixture,
     | instead of importing the class at the top of the file. This will load the
     | language data only when it's needed. (Otherwise, #[em all data] would be
    | loaded every time you run a test.)
@@ -494,7 +494,7 @@ p
 +code.
     @pytest.fixture
     def en_tokenizer():
-        return util.load_lang_class('en').Defaults.create_tokenizer()
+        return util.get_lang_class('en').Defaults.create_tokenizer()
 
 p
     | When adding test cases, always
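The fixture pattern shown in the docs combines naturally with parametrized test cases. A hypothetical example (token counts assume the default English tokenizer rules):

    import pytest

    @pytest.mark.parametrize('text,length', [
        ("Hello, world!", 4),   # Hello , world !
        ("don't", 2)])          # do n't
    def test_en_tokenizer_length(en_tokenizer, text, length):
        tokens = en_tokenizer(text)
        assert len(tokens) == length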