From da7520a83c6ec6ec22f74bcc265b57620f3b64d8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Feb 2022 11:35:34 +0100 Subject: [PATCH] Delay loading of mecab in Korean tokenizer (#10295) * Delay loading of mecab in Korean tokenizer Delay loading of mecab until the tokenizer is called the first time so that it's possible to initialize a blank `ko` pipeline without having mecab installed, e.g. for use with `spacy init vectors`. * Move mecab import back to __init__ Move mecab import back to __init__ to warn users at the same point as before about missing Python dependencies. --- spacy/lang/ko/__init__.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 05fc67e79..eb3c2e1f5 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -31,15 +31,24 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, vocab: Vocab): self.vocab = vocab - MeCab = try_mecab_import() # type: ignore[func-returns-value] - self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") + self._mecab = try_mecab_import() # type: ignore[func-returns-value] + self._mecab_tokenizer = None + + @property + def mecab_tokenizer(self): + # This is a property so that initializing a pipeline with blank:ko is + # possible without actually requiring mecab-ko, e.g. to run + # `spacy init vectors ko` for a pipeline that will have a different + # tokenizer in the end. The languages need to match for the vectors + # to be imported and there's no way to pass a custom config to + # `init vectors`. 
+ if self._mecab_tokenizer is None: + self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") + return self._mecab_tokenizer def __reduce__(self): return KoreanTokenizer, (self.vocab,) - def __del__(self): - self.mecab_tokenizer.__del__() - def __call__(self, text: str) -> Doc: dtokens = list(self.detailed_tokens(text)) surfaces = [dt["surface"] for dt in dtokens] @@ -90,7 +99,8 @@ def try_mecab_import() -> None: return MeCab except ImportError: raise ImportError( - "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "The Korean tokenizer (\"spacy.ko.KoreanTokenizer\") requires " + "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" ) from None