diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 85180b1e4..5bf685d44 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index a8bc56057..d4f8cc9e5 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 742bfcc6a..394ef00d3 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -343,6 +343,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")
 
 
+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -422,6 +430,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")
 
 
+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index 9ca7f441b..e82fd4f8c 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]
diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py
index 57dd4198a..788744aa1 100644
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]
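
For reference, a minimal sketch of how the mode enabled by this change is selected, assuming spaCy and pymorphy2 are installed (pymorphy2-dicts-uk is additionally needed for Ukrainian); it mirrors the ru_lookup_lemmatizer fixture above and is not part of the diff itself.

import spacy

# Sketch: pick the pymorphy2_lookup mode via the pipe config on a blank Russian pipeline.
nlp = spacy.blank("ru")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})

# Apply the lemmatizer directly to a tokenized Doc, as the tests do; the lookup mode
# uses only the token text, so no tagger or morph annotation is required.
doc = lemmatizer(nlp.make_doc("мама мыла раму"))
print([token.lemma_ for token in doc])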