From 8308c1525e921c7655470fb1b2e255c37dca68b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Sep 2018 15:18:21 +0200 Subject: [PATCH] Fix exception loading --- spacy/lemmatizer.py | 8 ++--- spacy/morphology.pxd | 2 ++ spacy/morphology.pyx | 85 ++++++++++++++++++++++++-------------------- 3 files changed, 53 insertions(+), 42 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 93121a0c5..483debb67 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -60,13 +60,13 @@ class Lemmatizer(object): return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': return True - elif VerbForm_inf in morphology: + elif VerbForm_inf in morphology or 'VerbForm_inf' in morphology: return True - elif VerbForm_none in morphology: + elif VerbForm_none in morphology or 'VerbForm_none' in morphology: return True - elif Number_sing in morphology: + elif Number_sing in morphology or 'Number_sing' in morphology: return True - elif Degree_pos in morphology: + elif Degree_pos in morphology or 'Degree_pos' in morphology: return True else: return False diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 05bc8ccc0..7ba84d40c 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -30,6 +30,8 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef update_morph(self, hash_t morph, features) + cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 + cdef enum univ_morph_t: NIL = 0 Animacy_anim = symbols.Animacy_anim diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 35571af49..f314a91a3 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -5,6 +5,7 @@ from __future__ import unicode_literals from libc.string cimport memset import ujson as json +from . import symbols from .attrs cimport POS, IS_SPACE from .attrs import LEMMA, intify_attrs from .parts_of_speech cimport SPACE @@ -17,6 +18,24 @@ from .errors import Errors def _normalize_props(props): """Transform deprecated string keys to correct names.""" out = {} + morph_keys = [ + 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', + 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', + 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', + 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', + 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', + 'NumValue', 'PartType', 'Polite', 'StyleVariant', + 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', + 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', + 'Polarity', 'PrepCase', 'Animacy' # U20 + ] + props = dict(props) + for key in morph_keys: + if key in props: + attr = '%s_%s' % (key, props[key]) + if attr in IDS: + props.pop(key) + props[attr] = True for key, value in props.items(): if key == POS: if hasattr(value, 'upper'): @@ -58,15 +77,16 @@ cdef class Morphology: self.n_tags = len(tag_map) self.reverse_index = {} for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - print(tag_str, attrs) + attrs = _normalize_props(attrs) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: - for (tag_str, orth_str), attrs in exc.items(): - self.add_special_case(tag_str, orth_str, attrs) + for (tag, orth), attrs in exc.items(): + self.add_special_case( + self.strings.as_string(tag), self.strings.as_string(orth), attrs) def __reduce__(self): return (Morphology, (self.strings, self.tag_map, self.lemmatizer, @@ -102,37 +122,10 @@ cdef class Morphology: tag (unicode): The part-of-speech tag to key the exception. orth (unicode): The word-form to key the exception. """ - pass - ## TODO: Currently we've assumed that we know the number of tags -- - ## RichTagC is an array, and _cache is a PreshMapArray - ## This is really bad: it makes the morphology typed to the tagger - ## classes, which is all wrong. - #self.exc[(tag_str, orth_str)] = dict(attrs) - #tag = self.strings.add(tag_str) - #if tag not in self.reverse_index: - # return - #tag_id = self.reverse_index[tag] - #orth = self.strings[orth_str] - #cdef RichTagC rich_tag = self.rich_tags[tag_id] - #attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - #cached = self._cache.get(tag_id, orth) - #if cached is NULL: - # cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) - #elif force: - # memset(cached, 0, sizeof(cached[0])) - #else: - # raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) - - #cached.tag = rich_tag - ## TODO: Refactor this to take arbitrary attributes. - #for name_id, value_id in attrs.items(): - # if name_id == LEMMA: - # cached.lemma = value_id - # else: - # self.assign_feature(&cached.tag.morph, name_id, value_id) - #if cached.lemma == 0: - # cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) - #self._cache.set(tag_id, orth, cached) + attrs = dict(attrs) + attrs = _normalize_props(attrs) + attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) + self.exc[(tag_str, self.strings.add(orth_str))] = attrs cdef hash_t insert(self, RichTagC tag) except 0: cdef hash_t key = hash_tag(tag) @@ -171,17 +164,27 @@ cdef class Morphology: tag_id = self.reverse_index[self.strings.add('_SP')] tag_str = self.tag_names[tag_id] features = dict(self.tag_map.get(tag_str, {})) - cdef attr_t lemma = self._cache.get(tag_id, token.lex.orth) - if lemma == 0 and features: + if features: pos = self.strings.as_int(features.pop(POS)) - lemma = self.lemmatize(pos, token.lex.orth, features) - self._cache.set(tag_id, token.lex.orth, lemma) else: pos = 0 + cdef attr_t lemma = self._cache.get(tag_id, token.lex.orth) + if lemma == 0: + lemma = self.lemmatize(pos, token.lex.orth, features) + self._cache.set(tag_id, token.lex.orth, lemma) token.lemma = lemma token.pos = pos token.tag = self.strings[tag_str] token.morph = self.add(features) + if (self.tag_names[tag_id], token.lex.orth) in self.exc: + self._assign_tag_from_exceptions(token, tag_id) + + cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1: + key = (self.tag_names[tag_id], token.lex.orth) + cdef dict attrs + attrs = self.exc[key] + token.pos = attrs.get(POS, token.pos) + token.lemma = attrs.get(LEMMA, token.lemma) cdef update_morph(self, hash_t morph, features): """Update a morphological analysis with new feature values.""" @@ -194,6 +197,12 @@ cdef class Morphology: morph = self.insert_tag(tag) return morph + def load_morph_exceptions(self, dict exc): + # Map (form, pos) to (lemma, rich tag) + for tag_str, entries in exc.items(): + for form_str, attrs in entries.items(): + self.add_special_case(tag_str, form_str, attrs) + def to_bytes(self): json_tags = [] for key in self.tags: