mirror of https://github.com/explosion/spaCy.git

Modernise lemmatizer tests

This commit is contained in:
parent 33d9cf87f9
commit affcf1b19d

@@ -1,34 +1,43 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import os
-import io
-import pickle
-import pathlib
-
-from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy import util
+from ...lemmatizer import read_index, read_exc
 
 import pytest
 
 
-@pytest.fixture
-def path():
-    if 'SPACY_DATA' in os.environ:
-        return pathlib.Path(os.environ['SPACY_DATA'])
-    else:
-        return util.match_best_version('en', None, util.get_data_path())
-
-
-@pytest.fixture
-def lemmatizer(path):
-    if path is not None:
-        return Lemmatizer.load(path)
-    else:
-        return None
-
-
-def test_read_index(path):
+@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
+                                         ("aardwolf", ["aardwolf"]),
+                                         ("planets", ["planet"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.noun(text) == set(lemmas)
+
+
+def test_tagger_lemmatizer_base_forms(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
+    assert lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+
+
+def test_tagger_lemmatizer_base_form_verb(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+
+
+def test_tagger_lemmatizer_punct(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.punct('“') == set(['"'])
+    assert lemmatizer.punct('”') == set(['"'])
+
+
+def test_tagger_lemmatizer_read_index(path):
     if path is not None:
         with (path / 'wordnet' / 'index.noun').open() as file_:
             index = read_index(file_)
@@ -37,67 +46,18 @@ def test_read_index(path):
         assert 'plant' in index
 
 
-def test_read_exc(path):
+@pytest.mark.parametrize('text,lemma', [("was", "be")])
+def test_tagger_lemmatizer_read_exc(path, text, lemma):
     if path is not None:
         with (path / 'wordnet' / 'verb.exc').open() as file_:
             exc = read_exc(file_)
-        assert exc['was'] == ('be',)
+        assert exc[text] == (lemma,)
 
 
-def test_noun_lemmas(lemmatizer):
-    if lemmatizer is None:
-        return None
-    do = lemmatizer.noun
-
-    assert do('aardwolves') == set(['aardwolf'])
-    assert do('aardwolf') == set(['aardwolf'])
-    assert do('planets') == set(['planet'])
-    assert do('ring') == set(['ring'])
-    assert do('axes') == set(['axis', 'axe', 'ax'])
-
-
-def test_base_form_dive(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    do = lemmatizer.noun
-    assert do('dive', {'number': 'sing'}) == set(['dive'])
-    assert do('dive', {'number': 'plur'}) == set(['diva'])
-
-
-def test_base_form_saw(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    do = lemmatizer.verb
-    assert do('saw', {'verbform': 'past'}) == set(['see'])
-
-
-def test_smart_quotes(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    do = lemmatizer.punct
-    assert do('“') == set(['"'])
-    assert do('”') == set(['"'])
-
-
-def test_pickle_lemmatizer(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    file_ = io.BytesIO()
-    pickle.dump(lemmatizer, file_)
-
-    file_.seek(0)
-
-    loaded = pickle.load(file_)
-
-
 @pytest.mark.models
-def test_tagger_lemma_assignment(EN):
-    tokens = u'Bananas in pyjamas are geese .'.split(' ')
-    doc = EN.tokenizer.tokens_from_list(tokens)
-    assert all( t.lemma_ == u'' for t in doc )
+def test_tagger_lemmatizer_lemma_assignment(EN):
+    text = "Bananas in pyjamas are geese."
+    doc = EN.tokenizer(text)
+    assert all(t.lemma_ == '' for t in doc)
     EN.tagger(doc)
-    assert all( t.lemma_ != u'' for t in doc )
+    assert all(t.lemma_ != '' for t in doc)
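Note: the `path` and `lemmatizer` fixtures are deleted from this file, yet the modernised tests still request them, so they presumably move into a shared pytest `conftest.py` (not shown in this commit). A minimal sketch of what that conftest could look like, reconstructed from the removed fixture bodies; the file location and exact contents are assumptions:

# conftest.py -- hypothetical location; bodies mirror the fixtures removed above
import os
import pathlib

import pytest

from spacy.lemmatizer import Lemmatizer
from spacy import util


@pytest.fixture
def path():
    # Prefer an explicit SPACY_DATA override; otherwise fall back to the
    # best matching installed 'en' data directory (None if nothing is found).
    if 'SPACY_DATA' in os.environ:
        return pathlib.Path(os.environ['SPACY_DATA'])
    else:
        return util.match_best_version('en', None, util.get_data_path())


@pytest.fixture
def lemmatizer(path):
    # The tests guard on None, so missing model data skips the test body
    # instead of raising during fixture setup.
    if path is not None:
        return Lemmatizer.load(path)
    else:
        return None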
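On the pattern itself: `@pytest.mark.parametrize` replaces the old multi-assert tests with one generated test case per tuple, so a failing lemma pair no longer masks the pairs after it. A self-contained toy illustration of how the decorator expands (not spaCy code; names are invented for the example):

import pytest

# pytest runs this once per tuple and reports each case separately,
# e.g. test_upper[abc-ABC] and test_upper[Hi-HI].
@pytest.mark.parametrize('text,expected', [("abc", "ABC"), ("Hi", "HI")])
def test_upper(text, expected):
    assert text.upper() == expected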