From 5e040855a5c5e7725fd875e4b85e38d53e113796 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 17:56:50 +1100 Subject: [PATCH 1/2] * Ensure morphological features and lemmas are loaded in from_array, re Issue #152 --- spacy/tests/serialize/test_io.py | 12 ++++++++++++ spacy/tokens/doc.pyx | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_io.py b/spacy/tests/serialize/test_io.py index a64d0cabc..4157ee309 100644 --- a/spacy/tests/serialize/test_io.py +++ b/spacy/tests/serialize/test_io.py @@ -38,3 +38,15 @@ def test_left_right(EN): for child in word.rights: assert child.head.i == word.i + +@pytest.mark.models +def test_lemmas(EN): + orig = EN(u'The geese are flying') + result = Doc(orig.vocab).from_bytes(orig.to_bytes()) + the, geese, are, flying = result + assert the.lemma_ == 'the' + assert geese.lemma_ == 'goose' + assert are.lemma_ == 'be' + assert flying.lemma_ == 'fly' + + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 01ccb4fd9..2ad1a1d4a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -398,7 +398,7 @@ cdef class Doc: self.is_parsed = True elif attr_id == TAG: for i in range(length): - tokens[i].tag = values[i] + self.vocab.morphology.assign_tag(&tokens[i], values[i]) if not self.is_tagged and tokens[i].tag != 0: self.is_tagged = True elif attr_id == POS: @@ -413,6 +413,8 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] + else: + raise ValueError("Unknown attribute ID: %d" % attr_id) set_children_from_heads(self.data, self.length) return self From 604ceac4c651b7263d252f1bb53abea2f27a6739 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 17:57:51 +1100 Subject: [PATCH 2/2] * Fix morphological assignment in doc.merge() --- spacy/tokens/doc.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2ad1a1d4a..7a8822b5f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -471,8 +471,7 @@ cdef class Doc: # Update fields token.lex = lex token.spacy = self.data[end-1].spacy - # What to do about morphology?? - # TODO: token.morph = ??? + self.vocab.morphology.assign_tag(token, self.vocab.strings[tag]) token.tag = self.vocab.strings[tag] token.lemma = self.vocab.strings[lemma] if ent_type == 'O':