diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 4d981b30d..922843d6d 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -30,6 +30,7 @@ cdef class Morphology: cdef public object n_tags cdef public object reverse_index cdef public object tag_names + cdef public object exc cdef RichTagC* rich_tags cdef PreshMapArray _cache diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b79fcaeef..13a0ed8e3 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -33,7 +33,7 @@ def _normalize_props(props): cdef class Morphology: - def __init__(self, StringStore string_store, tag_map, lemmatizer): + def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store self.tag_map = {} @@ -53,9 +53,14 @@ cdef class Morphology: self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i self._cache = PreshMapArray(self.n_tags) + self.exc = {} + if exc is not None: + for (tag_str, orth_str), attrs in exc.items(): + self.add_special_case(tag_str, orth_str, attrs) def __reduce__(self): - return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None) + return (Morphology, (self.strings, self.tag_map, self.lemmatizer, + self.exc), None, None) cdef int assign_tag(self, TokenC* token, tag) except -1: if isinstance(tag, basestring): @@ -106,6 +111,7 @@ cdef class Morphology: tag (unicode): The part-of-speech tag to key the exception. orth (unicode): The word-form to key the exception. """ + self.exc[(tag_str, orth_str)] = dict(attrs) tag = self.strings.add(tag_str) tag_id = self.reverse_index[tag] orth = self.strings[orth_str] diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 93955848f..29c7394ec 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -286,7 +286,8 @@ class NeuralTagger(object): cdef Vocab vocab = self.vocab if new_tag_map: vocab.morphology = Morphology(vocab.strings, new_tag_map, - vocab.morphology.lemmatizer) + vocab.morphology.lemmatizer, + exc=vocab.morphology.exc) token_vector_width = pipeline[0].model.nO if self.model is True: self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) @@ -322,7 +323,9 @@ class NeuralTagger(object): tag_map = msgpack.loads(b, encoding='utf8') self.vocab.morphology = Morphology( self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer) + lemmatizer=self.vocab.morphology.lemmatizer, + exc=self.vocab.morphology.exc) + deserialize = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('tag_map', load_tag_map), @@ -354,7 +357,9 @@ class NeuralTagger(object): tag_map = msgpack.loads(file_.read(), encoding='utf8') self.vocab.morphology = Morphology( self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer) + lemmatizer=self.vocab.morphology.lemmatizer, + exc=self.vocab.morphology.exc) + deserialize = OrderedDict(( ('vocab', lambda p: self.vocab.from_disk(p)),