mirror of https://github.com/explosion/spaCy.git
120 lines
4.1 KiB
Cython
120 lines
4.1 KiB
Cython
# cython: profile=True
|
|
# cython: embedsignature=True
|
|
from os import path
|
|
import json
|
|
|
|
from .lemmatizer import Lemmatizer
|
|
from .typedefs cimport id_t
|
|
from . import util
|
|
|
|
|
|
# Lookup table from universal part-of-speech tag names to their integer
# constants.  Punctuation is keyed by '.', and 'NULL' stands for "no tag".
# NOTE(review): the constants (NO_TAG, ADJ, ...) are not defined in this file;
# presumably they are cimported through the accompanying .pxd -- confirm.
UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL,
}
|
|
|
|
|
|
# One memoised analysis, stored in Morphologizer._cache under a
# (tag id, lexeme id) key.
cdef struct _Cached:
    # Morphological features assigned to the token.
    Morphology morph
    # String-store id of the lemma assigned to the token.
    int lemma
|
|
|
|
|
|
cdef class Morphologizer:
    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.

    Results are memoised in ``self._cache`` under a (tag id, lexeme id) key,
    so each distinct (tag, word) pair is lemmatized at most once.  Entries
    loaded from ``morphs.json`` are written into the same cache up front and
    therefore take precedence over the rule-based lemmatizer.
    """
    def __init__(self, StringStore strings, data_dir):
        """Build the tag table from ``<data_dir>/config.json``.

        strings:  the StringStore used to map between strings and integer ids.
        data_dir: directory containing ``config.json`` (keys ``tag_map`` and
                  ``tag_names``) and, optionally, ``morphs.json``.
        """
        self.mem = Pool()
        self.strings = strings
        # Use a context manager so the config file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(path.join(data_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        tag_map = cfg['tag_map']
        self.tag_names = cfg['tag_names']
        # NOTE(review): the lemmatizer data is read from util.DATA_DIR, not
        # from the data_dir argument -- confirm this is intentional.
        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
        self._cache = PreshMapArray(len(self.tag_names))
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            # Shared helper: same field-by-field fill (missing keys -> 0)
            # that load_exceptions uses for its cached entries.
            set_morph_from_dict(&self.tags[i].morph, props)
        if path.exists(path.join(data_dir, 'morphs.json')):
            with open(path.join(data_dir, 'morphs.json')) as file_:
                self.load_exceptions(json.load(file_))

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        """Return the string-store id of the lemma of ``lex`` under ``pos``.

        Only nouns, verbs and adjectives are lemmatized; for every other tag,
        or when no lemmatizer is available, the lexeme's own id (``lex.sic``)
        is returned unchanged.  Returns -1 only when an exception is raised.
        """
        cdef bytes py_string
        cdef set lemma_strings
        cdef bytes lemma_string
        if self.lemmatizer is None:
            return lex.sic
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.sic
        py_string = self.strings[lex.sic]
        if pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            # Only ADJ can reach this branch because of the guard above.
            lemma_strings = self.lemmatizer.adj(py_string)
        if not lemma_strings:
            # No candidates: fall back to the surface form, like the other
            # "cannot lemmatize" paths above, instead of raising IndexError.
            return lex.sic
        # Pick the alphabetically first candidate so the choice is
        # deterministic; min() gives the same result as sorted(...)[0]
        # without sorting the whole set.
        lemma_string = min(lemma_strings)
        return self.strings.intern(lemma_string, len(lemma_string)).i

    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        """Fill in ``tokens[i].lemma`` and ``tokens[i].morph``.

        ``tokens[i].pos`` is used as an index into ``self.tags``.  The
        analysis is looked up in the cache by (tag id, lexeme id) and is
        computed and stored on a miss.
        """
        cdef const PosTag* tag = &self.tags[tokens[i].pos]
        cdef _Cached* cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
        if cached is NULL:
            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph
        return 0

    def load_exceptions(self, dict exc):
        """Pre-populate the cache with hand-specified analyses.

        exc: mapping of tag name -> {word form -> properties dict}.  In each
             properties dict the optional key 'L' gives the lemma (default:
             the form itself); the remaining keys are read by
             set_morph_from_dict.

        Raises ValueError if a tag name is not present in ``self.tag_names``.
        """
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef id_t sic
        # Position of the tag in self.tag_names.  This is the same key space
        # as PosTag.id (used by set_morph), not a universal POS value, so it
        # is declared as a plain int rather than univ_tag_t.
        cdef int tag_id
        cdef _Cached* cached
        for pos_str, entries in exc.items():
            tag_id = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
                sic = self.strings[form_str]
                cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
                self._cache.set(tag_id, sic, <void*>cached)
|
|
|
|
|
|
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    """Copy morphological features from ``props`` into ``morph``.

    Each of the seven feature fields is read from the key of the same name;
    a missing key leaves the field at 0.  Returns 0 on success and -1 if an
    exception was raised.
    """
    lookup = props.get
    morph.number = lookup('number', 0)
    morph.tenspect = lookup('tenspect', 0)
    morph.mood = lookup('mood', 0)
    morph.gender = lookup('gender', 0)
    morph.person = lookup('person', 0)
    morph.case = lookup('case', 0)
    morph.misc = lookup('misc', 0)
    return 0
|