mirror of https://github.com/explosion/spaCy.git
* Improve efficiency of tagger, and improve morphological processing
This commit is contained in:
parent
6b34a2f34b
commit
42973c4b37
18
spacy/en.pxd
18
spacy/en.pxd
|
@ -125,23 +125,5 @@ cpdef enum:
|
|||
N_CONTEXT_FIELDS
|
||||
|
||||
|
||||
cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    # Populate the tagger's feature context for the token at index ``i``.
    #
    # A five-token window centred on ``tokens[i]`` is copied into ``context``:
    # the two previous tokens (P2, P1), the word itself (W) and the two
    # following tokens (N1, N2).  Each window slot starts at its ``*_sic``
    # offset and is filled by ``_fill_from_token``.
    #
    # NOTE(review): this reads tokens[i-2] .. tokens[i+2] with no bounds
    # check -- presumably the caller's buffer is padded on both sides; confirm.
    cdef const TokenC* centre = tokens + i
    _fill_from_token(&context[P2_sic], centre - 2)
    _fill_from_token(&context[P1_sic], centre - 1)
    _fill_from_token(&context[W_sic], centre)
    _fill_from_token(&context[N1_sic], centre + 1)
    _fill_from_token(&context[N2_sic], centre + 2)
|
||||
|
||||
|
||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Copy one token's features into seven consecutive slots of ``context``.
    #
    # Slots 0-4 come from the token's lexeme (``t.lex``); slots 5-6 come from
    # the token itself.

    # Lexeme-level attributes.
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix

    # Token-level attributes.
    context[5] = t.pos
    context[6] = t.sense
|
||||
|
||||
|
||||
cdef class English(Language):
|
||||
pass
|
||||
|
|
44
spacy/en.pyx
44
spacy/en.pyx
|
@ -151,10 +151,14 @@ cdef class English(Language):
|
|||
cdef int i
|
||||
cdef atom_t[N_CONTEXT_FIELDS] context
|
||||
cdef TokenC* t = tokens.data
|
||||
assert self.morphologizer is not None
|
||||
cdef dict tagdict = self.pos_tagger.tagdict
|
||||
for i in range(tokens.length):
|
||||
fill_pos_context(context, i, t)
|
||||
t[i].pos = self.pos_tagger.predict(context)
|
||||
if self.morphologizer:
|
||||
if t[i].lex.sic in tagdict:
|
||||
t[i].pos = tagdict[t[i].lex.sic]
|
||||
else:
|
||||
fill_pos_context(context, i, t)
|
||||
t[i].pos = self.pos_tagger.predict(context)
|
||||
self.morphologizer.set_morph(i, t)
|
||||
|
||||
def train_pos(self, Tokens tokens, golds):
|
||||
|
@ -165,27 +169,27 @@ cdef class English(Language):
|
|||
for i in range(tokens.length):
|
||||
fill_pos_context(context, i, t)
|
||||
t[i].pos = self.pos_tagger.predict(context, [golds[i]])
|
||||
if self.morphologizer:
|
||||
self.morphologizer.set_morph(i, t)
|
||||
self.morphologizer.set_morph(i, t)
|
||||
c += t[i].pos == golds[i]
|
||||
return c
|
||||
|
||||
|
||||
cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
    # Fill in the unset fields of ``tok_morph`` from ``pos_morph``.
    #
    # A field of ``tok_morph`` is treated as unset when it equals 0; any field
    # that already holds a non-zero value is left untouched.
    if tok_morph.number == 0: tok_morph.number = pos_morph.number
    if tok_morph.tenspect == 0: tok_morph.tenspect = pos_morph.tenspect
    if tok_morph.mood == 0: tok_morph.mood = pos_morph.mood
    if tok_morph.gender == 0: tok_morph.gender = pos_morph.gender
    if tok_morph.person == 0: tok_morph.person = pos_morph.person
    if tok_morph.case == 0: tok_morph.case = pos_morph.case
    if tok_morph.misc == 0: tok_morph.misc = pos_morph.misc
|
||||
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    # Populate the tagger's feature context for the token at index ``i``.
    #
    # Copies a five-token window around ``tokens[i]`` into ``context``: two
    # tokens of left context (P2, P1), the word (W) and two tokens of right
    # context (N1, N2).  Each slot begins at its ``*_sic`` offset.
    #
    # NOTE(review): tokens[i-2] .. tokens[i+2] are read without a bounds
    # check -- presumably the token buffer is padded at both ends; confirm.
    cdef const TokenC* centre = tokens + i
    _fill_from_token(&context[P2_sic], centre - 2)
    _fill_from_token(&context[P1_sic], centre - 1)
    _fill_from_token(&context[W_sic], centre)
    _fill_from_token(&context[N1_sic], centre + 1)
    _fill_from_token(&context[N2_sic], centre + 2)
|
||||
|
||||
|
||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the seven per-token features into ``context[0..6]``.
    #
    # The first five values are read from the token's lexeme, the last two
    # from the token struct itself.

    # From the lexeme.
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix

    # From the token.
    context[5] = t.pos
    context[6] = t.sense
|
||||
|
||||
|
||||
EN = English('en')
|
||||
|
|
|
@ -35,8 +35,8 @@ cdef class Morphologizer:
|
|||
cdef StringStore strings
|
||||
cdef object lemmatizer
|
||||
cdef PosTag* tags
|
||||
cdef readonly list tag_names
|
||||
|
||||
cdef PreshMapArray _morph
|
||||
cdef PreshMapArray _lemmas
|
||||
cdef PreshMapArray _cache
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
from os import path
|
||||
import json
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
from .typedefs cimport id_t
|
||||
|
||||
UNIV_TAGS = {
|
||||
'NULL': NO_TAG,
|
||||
|
@ -22,6 +24,11 @@ UNIV_TAGS = {
|
|||
}
|
||||
|
||||
|
||||
# Value type stored in Morphologizer._cache: the analysis remembered for one
# (tag id, word id) pair.
cdef struct _Cached:
    # Morphological features assigned to the word under this tag.
    Morphology morph
    # Lemma of the word under this tag (assigned from ``self.strings[...]`` or
    # from ``lemmatize`` elsewhere in this file).
    int lemma
|
||||
|
||||
|
||||
cdef class Morphologizer:
|
||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
||||
"""
|
||||
|
@ -30,12 +37,11 @@ cdef class Morphologizer:
|
|||
self.strings = strings
|
||||
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
|
||||
tag_map = cfg['tag_map']
|
||||
tag_names = cfg['tag_names']
|
||||
self.tag_names = cfg['tag_names']
|
||||
self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
|
||||
self._lemmas = PreshMapArray(N_UNIV_TAGS)
|
||||
self._morph = PreshMapArray(len(tag_names))
|
||||
self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
|
||||
for i, tag in enumerate(tag_names):
|
||||
self._cache = PreshMapArray(len(self.tag_names))
|
||||
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
|
||||
for i, tag in enumerate(self.tag_names):
|
||||
pos, props = tag_map[tag]
|
||||
self.tags[i].id = i
|
||||
self.tags[i].pos = pos
|
||||
|
@ -46,15 +52,15 @@ cdef class Morphologizer:
|
|||
self.tags[i].morph.person = props.get('person', 0)
|
||||
self.tags[i].morph.case = props.get('case', 0)
|
||||
self.tags[i].morph.misc = props.get('misc', 0)
|
||||
# Optional table of hand-written exceptions; silently skipped when the
# file is not present in the data directory.
exc_loc = path.join(data_dir, 'morph.json')
if path.exists(exc_loc):
    with open(exc_loc) as file_:
        # json.load reads from a file object.  json.loads expects a string
        # and raises TypeError when handed the open file.
        self.load_exceptions(json.load(file_))
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
return lex.sic
|
||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||
return lex.sic
|
||||
cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
|
||||
if lemma != 0:
|
||||
return lemma
|
||||
cdef bytes py_string = self.strings[lex.sic]
|
||||
cdef set lemma_strings
|
||||
cdef bytes lemma_string
|
||||
|
@ -67,15 +73,45 @@ cdef class Morphologizer:
|
|||
lemma_strings = self.lemmatizer.adj(py_string)
|
||||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings.intern(lemma_string, len(lemma_string)).i
|
||||
self._lemmas.set(pos, lex.sic, <void*>lemma)
|
||||
return lemma
|
||||
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
    """Set the lemma and morphology of ``tokens[i]`` from its POS tag.

    The analysis is looked up in ``self._cache`` under the key
    (tag id, word id).  On a miss, a new ``_Cached`` entry is allocated from
    ``self.mem``, filled with the lemma returned by ``self.lemmatize`` and the
    tag's default morphology, and stored so that later tokens with the same
    tag and word skip the lemmatizer.  Entries already in the cache (for
    example those inserted by ``load_exceptions``) are used as-is.

    ``tokens[i].pos`` must already be set; it is used as an index into
    ``self.tags``.
    """
    cdef const PosTag* tag = &self.tags[tokens[i].pos]
    cdef _Cached* cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
    if cached is NULL:
        # First time this (tag, word) pair is seen: compute once and keep it.
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
        cached.morph = tag.morph
        self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)

    tokens[i].lemma = cached.lemma
    tokens[i].morph = cached.morph
|
||||
|
||||
def load_exceptions(self, dict exc):
    """Seed the (tag, word) cache with hand-specified analyses.

    ``exc`` maps a tag name to a dict of ``{form: props}``.  For every form a
    ``_Cached`` entry is allocated from ``self.mem`` and stored in
    ``self._cache`` under (index of the tag name in ``self.tag_names``,
    ``self.strings[form]``).  The entry's lemma is ``props['L']`` when
    present, otherwise the form itself; its morphology is read from
    ``props`` by ``set_morph_from_dict``.

    Raises ValueError (from ``list.index``) if a tag name in ``exc`` is not
    in ``self.tag_names``.
    """
    cdef unicode pos_str
    cdef unicode form_str
    cdef unicode lemma_str
    cdef dict entries
    cdef dict props
    cdef int lemma
    cdef id_t sic
    cdef univ_tag_t pos
    cdef _Cached* cached

    for pos_str, entries in exc.items():
        # The cache is keyed by the tag's position in tag_names.
        pos = self.tag_names.index(pos_str)
        for form_str, props in entries.items():
            lemma_str = props.get('L', form_str)
            sic = self.strings[form_str]

            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.lemma = self.strings[lemma_str]
            set_morph_from_dict(&cached.morph, props)

            self._cache.set(pos, sic, <void*>cached)
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    # Overwrite every field of ``morph`` from the ``props`` dict.
    #
    # Each field is read from the key of the same name; a missing key
    # yields 0 for that field.
    morph.number   = props.get('number', 0)
    morph.tenspect = props.get('tenspect', 0)
    morph.mood     = props.get('mood', 0)
    morph.gender   = props.get('gender', 0)
    morph.person   = props.get('person', 0)
    morph.case     = props.get('case', 0)
    morph.misc     = props.get('misc', 0)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
import unicodedata
|
||||
from unidecode import unidecode
|
||||
import re
|
||||
|
||||
import math
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
|||
|
||||
from preshed.maps cimport PreshMapArray
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from .typedefs cimport hash_t, id_t
|
||||
from .tokens cimport Tokens, Morphology
|
||||
|
||||
|
||||
|
|
|
@ -72,10 +72,9 @@ cdef class Tagger:
|
|||
return tag_id
|
||||
|
||||
|
||||
|
||||
def _make_tag_dict(counts):
|
||||
freq_thresh = 50
|
||||
ambiguity_thresh = 0.98
|
||||
freq_thresh = 20
|
||||
ambiguity_thresh = 0.97
|
||||
tagdict = {}
|
||||
cdef atom_t word
|
||||
cdef atom_t tag
|
||||
|
|
Loading…
Reference in New Issue