From 80b94313b6bf71516fe68e4ffdd02c1015f4436b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 01:31:21 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Fix=20interaction=20of=20lemmati?= =?UTF-8?q?zer=20and=20tokenizer=20exceptions=20(#3388)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2203. Closes #3268. Lemmas set from outside the `Morphology` class were being overwritten. The result was especially confusing when deserialising, as it meant some lemmas could change when storing and retrieving a `Doc` object. This PR applies two fixes: 1) When we go to set the lemma in the `Morphology` class, first check whether a lemma is already set. If so, don't overwrite. 2) When we load with `doc.from_array()`, take care to apply the `TAG` field first. This allows other fields to overwrite the `TAG`-implied properties, if they're provided explicitly (e.g. the `LEMMA`). ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 
--- spacy/morphology.pyx | 3 ++- spacy/tests/regression/test_issue2001-2500.py | 21 +++++++++++++++++++ spacy/tokens/doc.pyx | 15 ++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index bd821d76f..ed1ee9a7e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -110,7 +110,8 @@ cdef class Morphology: analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, self.tag_map.get(tag_str, {})) self._cache.set(tag_id, token.lex.orth, analysis) - token.lemma = analysis.lemma + if token.lemma == 0: + token.lemma = analysis.lemma token.pos = analysis.tag.pos token.tag = analysis.tag.name token.morph = analysis.tag.morph diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index ed1c89671..df5d76641 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import numpy from spacy.tokens import Doc from spacy.displacy import render from spacy.gold import iob_to_biluo @@ -39,6 +40,26 @@ def test_issue2179(): assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) +def test_issue2203(en_vocab): + """Test that lemmas are set correctly in doc.from_array.""" + words = ["I", "'ll", "survive"] + tags = ["PRP", "MD", "VB"] + lemmas = ["-PRON-", "will", "survive"] + tag_ids = [en_vocab.strings.add(tag) for tag in tags] + lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] + doc = Doc(en_vocab, words=words) + # Work around lemma corruption problem and set lemmas after tags + doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) + doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) + assert [t.tag_ for t in doc] == tags + assert [t.lemma_ for t in doc] == lemmas + # We need to serialize both tag and lemma, since this is what causes the bug + doc_array = doc.to_array(["TAG", "LEMMA"]) + new_doc = 
Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) + assert [t.tag_ for t in new_doc] == tags + assert [t.lemma_ for t in new_doc] == lemmas + + def test_issue2219(en_vocab): vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] add_vecs_to_vocab(en_vocab, vectors) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 483fa6a10..4d3ed084a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -763,17 +763,18 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA + if TAG in attrs: + col = attrs.index(TAG) + for i in range(length): + if array[i, col] != 0: + self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) # Now load the data for i in range(self.length): token = &self.c[i] for j in range(n_attrs): - Token.set_struct_attr(token, attr_ids[j], array[i, j]) - # Auxiliary loading logic - for col, attr_id in enumerate(attrs): - if attr_id == TAG: - for i in range(length): - if array[i, col] != 0: - self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + if attr_ids[j] != TAG: + Token.set_struct_attr(token, attr_ids[j], array[i, j]) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)