Fix loading of morphology exceptions

This commit is contained in:
Matthew Honnibal 2017-06-04 16:34:32 -05:00
parent bb98d45a63
commit b78cc318c3
3 changed files with 17 additions and 5 deletions

View File

@@ -30,6 +30,7 @@ cdef class Morphology:
cdef public object n_tags cdef public object n_tags
cdef public object reverse_index cdef public object reverse_index
cdef public object tag_names cdef public object tag_names
cdef public object exc
cdef RichTagC* rich_tags cdef RichTagC* rich_tags
cdef PreshMapArray _cache cdef PreshMapArray _cache

View File

@@ -33,7 +33,7 @@ def _normalize_props(props):
cdef class Morphology: cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer): def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool() self.mem = Pool()
self.strings = string_store self.strings = string_store
self.tag_map = {} self.tag_map = {}
@@ -53,9 +53,14 @@ cdef class Morphology:
self.rich_tags[i].pos = attrs[POS] self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags) self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag_str, orth_str), attrs in exc.items():
self.add_special_case(tag_str, orth_str, attrs)
def __reduce__(self): def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None) return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring): if isinstance(tag, basestring):
@@ -106,6 +111,7 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception. tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception. orth (unicode): The word-form to key the exception.
""" """
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str) tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
orth = self.strings[orth_str] orth = self.strings[orth_str]

View File

@@ -286,7 +286,8 @@ class NeuralTagger(object):
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
if new_tag_map: if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer) vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO token_vector_width = pipeline[0].model.nO
if self.model is True: if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
@@ -322,7 +323,9 @@ class NeuralTagger(object):
tag_map = msgpack.loads(b, encoding='utf8') tag_map = msgpack.loads(b, encoding='utf8')
self.vocab.morphology = Morphology( self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map, self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer) lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict(( deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map), ('tag_map', load_tag_map),
@@ -354,7 +357,9 @@ class NeuralTagger(object):
tag_map = msgpack.loads(file_.read(), encoding='utf8') tag_map = msgpack.loads(file_.read(), encoding='utf8')
self.vocab.morphology = Morphology( self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map, self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer) lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict(( deserialize = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)), ('vocab', lambda p: self.vocab.from_disk(p)),