diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index ca5aaadb5..5aa48ddc6 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -1,34 +1,43 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import os
-import io
-import pickle
-import pathlib
-
-from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy import util
+from ...lemmatizer import read_index, read_exc
 
 import pytest
 
 
-@pytest.fixture
-def path():
-    if 'SPACY_DATA' in os.environ:
-        return pathlib.Path(os.environ['SPACY_DATA'])
-    else:
-        return util.match_best_version('en', None, util.get_data_path())
-
-
-@pytest.fixture
-def lemmatizer(path):
-    if path is not None:
-        return Lemmatizer.load(path)
-    else:
+@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
+                                         ("aardwolf", ["aardwolf"]),
+                                         ("planets", ["planet"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
+    if lemmatizer is None:
         return None
+    assert lemmatizer.noun(text) == set(lemmas)
 
 
-def test_read_index(path):
+def test_tagger_lemmatizer_base_forms(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
+    assert lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+
+
+def test_tagger_lemmatizer_base_form_verb(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+
+
+def test_tagger_lemmatizer_punct(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.punct('“') == set(['"'])
+    assert lemmatizer.punct('“') == set(['"'])
+
+
+def test_tagger_lemmatizer_read_index(path):
     if path is not None:
         with (path / 'wordnet' / 'index.noun').open() as file_:
             index = read_index(file_)
@@ -37,67 +46,18 @@ def test_read_index(path):
         assert 'plant' in index
 
 
-def test_read_exc(path):
+@pytest.mark.parametrize('text,lemma', [("was", "be")])
+def test_tagger_lemmatizer_read_exc(path, text, lemma):
     if path is not None:
         with (path / 'wordnet' / 'verb.exc').open() as file_:
             exc = read_exc(file_)
-        assert exc['was'] == ('be',)
-
-
-def test_noun_lemmas(lemmatizer):
-    if lemmatizer is None:
-        return None
-    do = lemmatizer.noun
-
-    assert do('aardwolves') == set(['aardwolf'])
-    assert do('aardwolf') == set(['aardwolf'])
-    assert do('planets') == set(['planet'])
-    assert do('ring') == set(['ring'])
-    assert do('axes') == set(['axis', 'axe', 'ax'])
-
-
-def test_base_form_dive(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    do = lemmatizer.noun
-    assert do('dive', {'number': 'sing'}) == set(['dive'])
-    assert do('dive', {'number': 'plur'}) == set(['diva'])
-
-
-def test_base_form_saw(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    do = lemmatizer.verb
-    assert do('saw', {'verbform': 'past'}) == set(['see'])
-
-
-def test_smart_quotes(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    do = lemmatizer.punct
-    assert do('“') == set(['"'])
-    assert do('“') == set(['"'])
-
-
-def test_pickle_lemmatizer(lemmatizer):
-    if lemmatizer is None:
-        return None
-
-    file_ = io.BytesIO()
-    pickle.dump(lemmatizer, file_)
-
-    file_.seek(0)
-
-    loaded = pickle.load(file_)
+        assert exc[text] == (lemma,)
 
 
 @pytest.mark.models
-def test_tagger_lemma_assignment(EN):
-    tokens = u'Bananas in pyjamas are geese .'.split(' ')
-    doc = EN.tokenizer.tokens_from_list(tokens)
-    assert all( t.lemma_ == u'' for t in doc )
+def test_tagger_lemmatizer_lemma_assignment(EN):
+    text = "Bananas in pyjamas are geese."
+    doc = EN.tokenizer(text)
+    assert all(t.lemma_ == '' for t in doc)
     EN.tagger(doc)
-    assert all( t.lemma_ != u'' for t in doc )
+    assert all(t.lemma_ != '' for t in doc)