diff --git a/spacy/en.pxd b/spacy/en.pxd index cee754d9c..4ac8a126d 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -125,23 +125,5 @@ cpdef enum: N_CONTEXT_FIELDS -cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil: - _fill_from_token(&context[P2_sic], &tokens[i-2]) - _fill_from_token(&context[P1_sic], &tokens[i-1]) - _fill_from_token(&context[W_sic], &tokens[i]) - _fill_from_token(&context[N1_sic], &tokens[i+1]) - _fill_from_token(&context[N2_sic], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.sic - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.pos - context[6] = t.sense - - cdef class English(Language): pass diff --git a/spacy/en.pyx b/spacy/en.pyx index 9cd2546cb..10773e0e2 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -151,10 +151,14 @@ cdef class English(Language): cdef int i cdef atom_t[N_CONTEXT_FIELDS] context cdef TokenC* t = tokens.data + assert self.morphologizer is not None + cdef dict tagdict = self.pos_tagger.tagdict for i in range(tokens.length): - fill_pos_context(context, i, t) - t[i].pos = self.pos_tagger.predict(context) - if self.morphologizer: + if t[i].lex.sic in tagdict: + t[i].pos = tagdict[t[i].lex.sic] + else: + fill_pos_context(context, i, t) + t[i].pos = self.pos_tagger.predict(context) self.morphologizer.set_morph(i, t) def train_pos(self, Tokens tokens, golds): @@ -165,27 +169,27 @@ cdef class English(Language): for i in range(tokens.length): fill_pos_context(context, i, t) t[i].pos = self.pos_tagger.predict(context, [golds[i]]) - if self.morphologizer: - self.morphologizer.set_morph(i, t) + self.morphologizer.set_morph(i, t) c += t[i].pos == golds[i] return c -cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1: - if tok_morph.number == 0: - tok_morph.number = pos_morph.number - if tok_morph.tenspect == 0: - 
tok_morph.tenspect = pos_morph.tenspect - if tok_morph.mood == 0: - tok_morph.mood = pos_morph.mood - if tok_morph.gender == 0: - tok_morph.gender = pos_morph.gender - if tok_morph.person == 0: - tok_morph.person = pos_morph.person - if tok_morph.case == 0: - tok_morph.case = pos_morph.case - if tok_morph.misc == 0: - tok_morph.misc = pos_morph.misc +cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1: + _fill_from_token(&context[P2_sic], &tokens[i-2]) + _fill_from_token(&context[P1_sic], &tokens[i-1]) + _fill_from_token(&context[W_sic], &tokens[i]) + _fill_from_token(&context[N1_sic], &tokens[i+1]) + _fill_from_token(&context[N2_sic], &tokens[i+2]) + + +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.sic + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.pos + context[6] = t.sense EN = English('en') diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 084cbbbe6..31cb08855 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,8 +35,8 @@ cdef class Morphologizer: cdef StringStore strings cdef object lemmatizer cdef PosTag* tags + cdef readonly list tag_names - cdef PreshMapArray _morph - cdef PreshMapArray _lemmas + cdef PreshMapArray _cache cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 cdef int set_morph(self, const int i, TokenC* tokens) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 63c5ff827..b21a3ced4 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,8 +1,10 @@ +# cython: profile=True +# cython: embedsignature=True from os import path import json from .lemmatizer import Lemmatizer - +from .typedefs cimport id_t UNIV_TAGS = { 'NULL': NO_TAG, @@ -22,6 +24,11 @@ UNIV_TAGS = { } +cdef struct _Cached: + Morphology morph + int lemma + + cdef class Morphologizer: """Given a POS tag and a Lexeme, find its lemma and 
morphological analysis. """ @@ -30,12 +37,11 @@ cdef class Morphologizer: self.strings = strings cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) tag_map = cfg['tag_map'] - tag_names = cfg['tag_names'] + self.tag_names = cfg['tag_names'] self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) - self._lemmas = PreshMapArray(N_UNIV_TAGS) - self._morph = PreshMapArray(len(tag_names)) - self.tags = self.mem.alloc(len(tag_names), sizeof(PosTag)) - for i, tag in enumerate(tag_names): + self._cache = PreshMapArray(len(self.tag_names)) + self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) + for i, tag in enumerate(self.tag_names): pos, props = tag_map[tag] self.tags[i].id = i self.tags[i].pos = pos @@ -46,15 +52,15 @@ cdef class Morphologizer: self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.misc = props.get('misc', 0) + if path.exists(path.join(data_dir, 'morph.json')): + with open(path.join(data_dir, 'morph.json')) as file_: + self.load_exceptions(json.load(file_)) cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: return lex.sic if pos != NOUN and pos != VERB and pos != ADJ: return lex.sic - cdef int lemma = self._lemmas.get(pos, lex.sic) - if lemma != 0: - return lemma cdef bytes py_string = self.strings[lex.sic] cdef set lemma_strings cdef bytes lemma_string @@ -67,15 +73,45 @@ cdef class Morphologizer: lemma_strings = self.lemmatizer.adj(py_string) lemma_string = sorted(lemma_strings)[0] lemma = self.strings.intern(lemma_string, len(lemma_string)).i - self._lemmas.set(pos, lex.sic, lemma) return lemma cdef int set_morph(self, const int i, TokenC* tokens) except -1: cdef const PosTag* tag = &self.tags[tokens[i].pos] - tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex) - morph = self._morph.get(tag.id, tokens[i].lemma) - if morph is NULL: - self._morph.set(tag.id, tokens[i].lemma, &tag.morph) - 
tokens[i].morph = tag.morph - else: - tokens[i].morph = morph[0] + cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic) + if cached is NULL: + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._cache.set(tag.id, tokens[i].lex.sic, cached) + + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + def load_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef id_t sic + cdef univ_tag_t pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + sic = self.strings[form_str] + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._cache.set(pos, sic, cached) + + +cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: + morph.number = props.get('number', 0) + morph.tenspect = props.get('tenspect', 0) + morph.mood = props.get('mood', 0) + morph.gender = props.get('gender', 0) + morph.person = props.get('person', 0) + morph.case = props.get('case', 0) + morph.misc = props.get('misc', 0) diff --git a/spacy/orth.py b/spacy/orth.py index 0462d15df..2400b38a6 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import unicodedata from unidecode import unidecode +import re import math diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 9abe25209..a896742ad 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from preshed.maps cimport PreshMapArray -from .typedefs cimport hash_t +from .typedefs cimport hash_t, id_t from .tokens cimport Tokens, Morphology diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 
a1e51c5b5..9890e95e1 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -72,10 +72,9 @@ cdef class Tagger: return tag_id - def _make_tag_dict(counts): - freq_thresh = 50 - ambiguity_thresh = 0.98 + freq_thresh = 20 + ambiguity_thresh = 0.97 tagdict = {} cdef atom_t word cdef atom_t tag