mirror of https://github.com/explosion/spaCy.git
Use new Lemmatizer data and remove file import
Since there's currently only an English lemmatizer, the global Lemmatizer imports from spacy.en. This is not ideal and still needs to be fixed.
parent 0957737ee8
commit 1da29a7146
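Concretely, the dependency now points from the language-independent module into the English subpackage. A minimal sketch of the direction this creates (the import is the one added in the diff below; the suggested future shape is hypothetical):

    # spacy/lemmatizer.py is language-independent, yet now imports English data:
    from spacy.en.lemmatizer import INDEX, EXC, RULES

    # Once other languages ship lemmatizer data, the cleaner direction would be
    # for each language to hand its own tables to the shared class, e.g.:
    #     Lemmatizer(de_index, de_exc, de_rules)  # hypothetical, not in this commit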
spacy/lemmatizer.py
@@ -4,32 +4,16 @@ import pathlib
 
 import ujson as json
 
+from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 
 
 class Lemmatizer(object):
     @classmethod
     def load(cls, path, rules=None):
-        index = {}
-        exc = {}
-        for pos in ['adj', 'noun', 'verb']:
-            pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos)
-            if pos_index_path.exists():
-                with pos_index_path.open() as file_:
-                    index[pos] = read_index(file_)
-            else:
-                index[pos] = set()
-            pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos)
-            if pos_exc_path.exists():
-                with pos_exc_path.open() as file_:
-                    exc[pos] = read_exc(file_)
-            else:
-                exc[pos] = {}
-        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
-            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
-        elif rules is None:
-            rules = {}
+        index = dict(INDEX)
+        exc = dict(EXC)
+        rules = dict(RULES)
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
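With the file parsing gone, load() just copies three module-level objects from spacy.en.lemmatizer. A minimal sketch of the shapes those objects presumably have, inferred from how the old loop and lemmatize() consume them; the sample entries are illustrative, not the real data:

    from spacy.lemmatizer import Lemmatizer

    # Assumed shapes, keyed by coarse POS like the old per-file loop:
    INDEX = {'adj': set(), 'noun': {'man', 'plant'}, 'verb': {'be'}}            # known lemmas
    EXC = {'adj': {}, 'noun': {'geese': ('goose',)}, 'verb': {'was': ('be',)}}  # irregular forms
    RULES = {'adj': [], 'noun': [['s', '']], 'verb': [['ing', '']]}             # suffix rewrites

    # The new load() is now equivalent to:
    lemmatizer = Lemmatizer(dict(INDEX), dict(EXC), dict(RULES))

Note that both arguments of load() are now ignored: path is never touched and the rules keyword is overwritten by dict(RULES), so existing call sites keep working but the WordNet files and lemma_rules.json are no longer read.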
@@ -94,25 +78,3 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
-
-
-def read_index(fileobj):
-    index = set()
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        word = pieces[0]
-        if word.count('_') == 0:
-            index.add(word)
-    return index
-
-
-def read_exc(fileobj):
-    exceptions = {}
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        exceptions[pieces[0]] = tuple(pieces[1:])
-    return exceptions
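For context on what the deleted helpers parsed: the WordNet index and exception files are plain text with a space-indented license header, which the startswith(' ') check skips. A rough sketch with illustrative file fragments:

    import io

    # Fragment shaped like WordNet's index.noun: lemma first, details after.
    index_noun = io.StringIO(
        '  1 This software and database... (license header, skipped)\n'
        'man n 1 ...\n'
        'plant n 3 ...\n'
        'sea_urchin n 1 ...\n'  # entries containing '_' were dropped
    )
    # Fragment shaped like verb.exc: inflected form, then its lemma(s).
    verb_exc = io.StringIO('was be\nate eat\n')

    # The removed helpers would have produced:
    #   read_index(index_noun) -> {'man', 'plant'}
    #   read_exc(verb_exc)     -> {'was': ('be',), 'ate': ('eat',)}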
spacy/tests/tagger/test_lemmatizer.py
@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...lemmatizer import read_index, read_exc
-
 import pytest
 
 
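Dropping the helpers is also a small breaking change for any code that imported them, which is exactly what this test module has to stop doing:

    # After this commit, this import raises ImportError:
    from spacy.lemmatizer import read_index, read_exc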
@@ -41,25 +39,6 @@ def test_tagger_lemmatizer_punct(lemmatizer):
     assert lemmatizer.punct('“') == set(['"'])
 
 
-@pytest.mark.models
-def test_tagger_lemmatizer_read_index(path):
-    if path is not None:
-        with (path / 'wordnet' / 'index.noun').open() as file_:
-            index = read_index(file_)
-        assert 'man' in index
-        assert 'plantes' not in index
-        assert 'plant' in index
-
-
-@pytest.mark.models
-@pytest.mark.parametrize('text,lemma', [("was", "be")])
-def test_tagger_lemmatizer_read_exc(path, text, lemma):
-    if path is not None:
-        with (path / 'wordnet' / 'verb.exc').open() as file_:
-            exc = read_exc(file_)
-        assert exc[text] == (lemma,)
-
-
 @pytest.mark.models
 def test_tagger_lemmatizer_lemma_assignment(EN):
     text = "Bananas in pyjamas are geese."
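The tests for the file readers go away with the readers; what remains is gated on an installed model via the models marker. Assuming the file lives at spacy/tests/tagger/test_lemmatizer.py (inferred from the relative import above), the surviving tests can be selected with pytest's marker filter:

    python -m pytest spacy/tests/tagger/test_lemmatizer.py -m models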