Fix loading of morphology exceptions

2017-06-04 16:34:32 -05:00 · 2017-06-04 16:34:32 -05:00 · b78cc318c3
parent bb98d45a63
commit b78cc318c3
3 changed files with 17 additions and 5 deletions
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -30,6 +30,7 @@ cdef class Morphology:
    cdef public object n_tags
    cdef public object reverse_index
    cdef public object tag_names
+    cdef public object exc

    cdef RichTagC* rich_tags
    cdef PreshMapArray _cache
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -33,7 +33,7 @@ def _normalize_props(props):


 cdef class Morphology:
-    def __init__(self, StringStore string_store, tag_map, lemmatizer):
+    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        self.mem = Pool()
        self.strings = string_store
        self.tag_map = {}
@@ -53,9 +53,14 @@ cdef class Morphology:
            self.rich_tags[i].pos = attrs[POS]
            self.reverse_index[self.rich_tags[i].name] = i
        self._cache = PreshMapArray(self.n_tags)
+        self.exc = {}
+        if exc is not None:
+            for (tag_str, orth_str), attrs in exc.items():
+                self.add_special_case(tag_str, orth_str, attrs)

    def __reduce__(self):
-        return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
+        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
+                             self.exc), None, None)

    cdef int assign_tag(self, TokenC* token, tag) except -1:
        if isinstance(tag, basestring):
@@ -106,6 +111,7 @@ cdef class Morphology:
            tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
        """
+        self.exc[(tag_str, orth_str)] = dict(attrs)
        tag = self.strings.add(tag_str)
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -286,7 +286,8 @@ class NeuralTagger(object):
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            vocab.morphology = Morphology(vocab.strings, new_tag_map,
-                                          vocab.morphology.lemmatizer)
+                                          vocab.morphology.lemmatizer,
+                                          exc=vocab.morphology.exc)
        token_vector_width = pipeline[0].model.nO
        if self.model is True:
            self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
@@ -322,7 +323,9 @@ class NeuralTagger(object):
            tag_map = msgpack.loads(b, encoding='utf8')
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
-                lemmatizer=self.vocab.morphology.lemmatizer)
+                lemmatizer=self.vocab.morphology.lemmatizer,
+                exc=self.vocab.morphology.exc)
+ 
        deserialize = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('tag_map', load_tag_map),
@@ -354,7 +357,9 @@ class NeuralTagger(object):
                tag_map = msgpack.loads(file_.read(), encoding='utf8')
            self.vocab.morphology = Morphology(
                self.vocab.strings, tag_map=tag_map,
-                lemmatizer=self.vocab.morphology.lemmatizer)
+                lemmatizer=self.vocab.morphology.lemmatizer,
+                exc=self.vocab.morphology.exc)
+ 

        deserialize = OrderedDict((
            ('vocab', lambda p: self.vocab.from_disk(p)),