mirror of https://github.com/explosion/spaCy.git
Use new Lemmatizer data and remove file import
Since there's currently only an English lemmatizer, the global Lemmatizer imports from spacy.en. This isn't ideal and still needs to be fixed.
This commit is contained in:
parent 0957737ee8
commit 1da29a7146
spacy/lemmatizer.py
@@ -4,32 +4,16 @@ import pathlib
-import ujson as json
+from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 
 
 class Lemmatizer(object):
     @classmethod
     def load(cls, path, rules=None):
-        index = {}
-        exc = {}
-        for pos in ['adj', 'noun', 'verb']:
-            pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos)
-            if pos_index_path.exists():
-                with pos_index_path.open() as file_:
-                    index[pos] = read_index(file_)
-            else:
-                index[pos] = set()
-            pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos)
-            if pos_exc_path.exists():
-                with pos_exc_path.open() as file_:
-                    exc[pos] = read_exc(file_)
-            else:
-                exc[pos] = {}
-        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
-            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
-        elif rules is None:
-            rules = {}
+        index = dict(INDEX)
+        exc = dict(EXC)
+        rules = dict(RULES)
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
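After this hunk, Lemmatizer.load no longer reads the WordNet files from disk; it builds its tables straight from the bundled English data module. A minimal sketch of the new path, assuming INDEX, EXC and RULES are mappings keyed by the coarse POS names ('adj', 'noun', 'verb') that the removed per-POS loop used:

# A minimal sketch of the new loading path, not the shipped API:
# the tables now come from spacy.en.lemmatizer rather than data files.
from spacy.lemmatizer import Lemmatizer, lemmatize
from spacy.en.lemmatizer import INDEX, EXC, RULES

lemmatizer = Lemmatizer(dict(INDEX), dict(EXC), dict(RULES))

# lemmatize() is the module-level helper shown in the next hunk; it works
# per part of speech, assuming the data is keyed by 'noun'/'verb'/'adj':
forms = lemmatize('geese', INDEX['noun'], EXC['noun'], RULES['noun'])
print(forms)  # expected to contain 'goose'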
@@ -94,25 +78,3 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
-
-
-def read_index(fileobj):
-    index = set()
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        word = pieces[0]
-        if word.count('_') == 0:
-            index.add(word)
-    return index
-
-
-def read_exc(fileobj):
-    exceptions = {}
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        exceptions[pieces[0]] = tuple(pieces[1:])
-    return exceptions
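For reference, the removed read_exc logic can be rerun against an in-memory snippet in WordNet's *.exc format (one inflected form followed by its lemma or lemmas per line; the sample data mirrors the "was" -> "be" case from the removed test below):

import io

def read_exc(fileobj):
    # Verbatim logic from the removed helper: skip space-prefixed header
    # lines, map each inflected form to a tuple of lemmas.
    exceptions = {}
    for line in fileobj:
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions

sample = io.StringIO("was be\ngeese goose\n")
print(read_exc(sample))  # {'was': ('be',), 'geese': ('goose',)}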
spacy/tests/tagger/test_lemmatizer.py
@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...lemmatizer import read_index, read_exc
-
 import pytest
 
 
@@ -41,25 +39,6 @@ def test_tagger_lemmatizer_punct(lemmatizer):
     assert lemmatizer.punct('“') == set(['"'])
 
 
-@pytest.mark.models
-def test_tagger_lemmatizer_read_index(path):
-    if path is not None:
-        with (path / 'wordnet' / 'index.noun').open() as file_:
-            index = read_index(file_)
-        assert 'man' in index
-        assert 'plantes' not in index
-        assert 'plant' in index
-
-
-@pytest.mark.models
-@pytest.mark.parametrize('text,lemma', [("was", "be")])
-def test_tagger_lemmatizer_read_exc(path, text, lemma):
-    if path is not None:
-        with (path / 'wordnet' / 'verb.exc').open() as file_:
-            exc = read_exc(file_)
-        assert exc[text] == (lemma,)
-
-
 @pytest.mark.models
 def test_tagger_lemmatizer_lemma_assignment(EN):
     text = "Bananas in pyjamas are geese."
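The deleted read_index test asserted membership against the real WordNet index.noun file. The same behaviour can be reproduced with the removed helper on an in-memory snippet; the sample lines below are illustrative stand-ins for WordNet's index format, where header lines start with a space and multiword entries contain underscores:

import io

def read_index(fileobj):
    # Verbatim logic from the removed helper: keep the first field of each
    # non-header line, skipping multiword (underscored) entries.
    index = set()
    for line in fileobj:
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index

sample = io.StringIO(
    "  1 This is a header line and is skipped\n"
    "man n 11 ...\n"
    "plant n 4 ...\n"
    "plant_organ n 1 ...\n"  # hypothetical multiword entry, skipped
)
index = read_index(sample)
assert 'man' in index and 'plant' in index
assert 'plantes' not in index      # mirrors the removed test's assertion
assert 'plant_organ' not in index  # underscored entries are dropped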