From b7e3018d9707835bdbe3e2a6e61acd9858c3bc9f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 4 Aug 2020 14:31:32 +0200
Subject: [PATCH] Recalculate alignment if tokenization differs (#5868)

* Recalculate alignment if tokenization differs

* Refactor cached alignment data
---
 spacy/gold/example.pxd   |  4 +++-
 spacy/gold/example.pyx   | 28 ++++++++++++++++------------
 spacy/tests/test_gold.py | 15 +++++++++++++++
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/spacy/gold/example.pxd b/spacy/gold/example.pxd
index 1f63b12d0..e06e36287 100644
--- a/spacy/gold/example.pxd
+++ b/spacy/gold/example.pxd
@@ -4,4 +4,6 @@ from ..tokens.doc cimport Doc
 cdef class Example:
     cdef readonly Doc x
     cdef readonly Doc y
-    cdef readonly object _alignment
+    cdef readonly object _cached_alignment
+    cdef readonly object _cached_words_x
+    cdef readonly object _cached_words_y
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 84d9f1622..8d320ce93 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -32,9 +32,9 @@ cdef class Example:
             raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
             raise TypeError(Errors.E972.format(arg="reference"))
-        self.x = predicted
-        self.y = reference
-        self._alignment = alignment
+        self.predicted = predicted
+        self.reference = reference
+        self._cached_alignment = alignment
 
     def __len__(self):
         return len(self.predicted)
@@ -45,7 +45,8 @@ cdef class Example:
 
         def __set__(self, doc):
             self.x = doc
-            self._alignment = None
+            self._cached_alignment = None
+            self._cached_words_x = [t.text for t in doc]
 
     property reference:
         def __get__(self):
@@ -53,7 +54,8 @@ cdef class Example:
 
         def __set__(self, doc):
             self.y = doc
-            self._alignment = None
+            self._cached_alignment = None
+            self._cached_words_y = [t.text for t in doc]
 
     def copy(self):
         return Example(
@@ -79,13 +81,15 @@ cdef class Example:
 
     @property
     def alignment(self):
-        if self._alignment is None:
-            spacy_words = [token.orth_ for token in self.predicted]
-            gold_words = [token.orth_ for token in self.reference]
-            if gold_words == []:
-                gold_words = spacy_words
-            self._alignment = Alignment.from_strings(spacy_words, gold_words)
-        return self._alignment
+        words_x = [token.text for token in self.x]
+        words_y = [token.text for token in self.y]
+        if self._cached_alignment is None or \
+                words_x != self._cached_words_x or \
+                words_y != self._cached_words_y:
+            self._cached_alignment = Alignment.from_strings(words_x, words_y)
+            self._cached_words_x = words_x
+            self._cached_words_y = words_y
+        return self._cached_alignment
 
     def get_aligned(self, field, as_string=False):
         """Return an aligned array for a token attribute."""
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index c44daf630..4c1ee3b82 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -655,3 +655,18 @@ def test_split_sents(merged_dict):
     assert token_annotation_2["words"] == ["It", "is", "just", "me"]
     assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
     assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
+
+
+def test_retokenized_docs(doc):
+    a = doc.to_array(["TAG"])
+    doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
+    doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
+    example = Example(doc1, doc2)
+
+    assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
+
+    with doc1.retokenize() as retokenizer:
+        retokenizer.merge(doc1[0:2])
+        retokenizer.merge(doc1[5:7])
+
+    assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']