From ba02957c80c49658f323c49df8d1f768a11e99b3 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 23 Jul 2019 18:28:55 +0200 Subject: [PATCH] Fix dependency copy for as_doc (#3969) * failing unit test for issue 3962 * attempt to fix Issue #3962 * create artificial unit test example * using length instead of self.length * sp * reformat with black * find better ancestor within span and use generic 'dep' * attach to span.root if there is no appropriate ancestor * comment span text * clean up ancestor code * reconstruct dep tree to keep same number of sentences --- spacy/tests/regression/test_issue3962.py | 112 +++++++++++++++++++++++ spacy/tokens/doc.pyx | 4 +- spacy/tokens/span.pyx | 46 +++++++++- 3 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue3962.py diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py new file mode 100644 index 000000000..c7979c2f3 --- /dev/null +++ b/spacy/tests/regression/test_issue3962.py @@ -0,0 +1,112 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ..util import get_doc + + +@pytest.fixture +def doc(en_tokenizer): + text = "He jests at scars, that never felt a wound." + heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + deps = [ + "nsubj", + "ccomp", + "prep", + "pobj", + "punct", + "nsubj", + "neg", + "ROOT", + "det", + "dobj", + "punct", + ] + tokens = en_tokenizer(text) + return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + + +def test_issue3962(doc): + """ Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + + assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +@pytest.fixture +def two_sent_doc(en_tokenizer): + text = "He jests at scars. They never felt a wound." + heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + deps = [ + "nsubj", + "ROOT", + "prep", + "pobj", + "punct", + "nsubj", + "neg", + "ROOT", + "det", + "dobj", + "punct", + ] + tokens = en_tokenizer(text) + return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) + + +def test_issue3962_long(two_sent_doc): + """ Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + + assert doc2[0].head.text == "jests" # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + assert doc2[4].head.text == "They" # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].dep_ == "dep" + assert doc2[4].head.text == "They" # head set to the new artificial head (in sentence 2) + assert doc2[4].dep_ == "dep" + + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c1883f9c0..7ab1563e9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -794,7 +794,7 @@ cdef class Doc: if array[i, col] != 0: self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) # Now load the data - for i in range(self.length): + for i in range(length): token = &self.c[i] for j in range(n_attrs): if attr_ids[j] != TAG: @@ -804,7 +804,7 @@ cdef class Doc: self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) # If document is parsed, set children if self.is_parsed: - set_children_from_heads(self.c, self.length) + set_children_from_heads(self.c, length) return self def get_lca_matrix(self): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 3f4f4418b..42fb9852d 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -17,6 +17,7 @@ from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t from ..attrs cimport * from ..lexeme cimport Lexeme +from ..symbols cimport dep from ..util import normalize_slice from ..compat import is_config, basestring_ @@ -206,7 +207,6 @@ cdef class Span: DOCS: https://spacy.io/api/span#as_doc """ - # TODO: Fix! words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) @@ -220,7 +220,9 @@ cdef class Span: else: array_head.append(SENT_START) array = self.doc.to_array(array_head) - doc.from_array(array_head, array[self.start : self.end]) + array = array[self.start : self.end] + self._fix_dep_copy(array_head, array) + doc.from_array(array_head, array) doc.noun_chunks_iterator = self.doc.noun_chunks_iterator doc.user_hooks = self.doc.user_hooks doc.user_span_hooks = self.doc.user_span_hooks @@ -235,6 +237,44 @@ cdef class Span: doc.cats[cat_label] = value return doc + def _fix_dep_copy(self, attrs, array): + """ Rewire dependency links to make sure their heads fall into the span + while still keeping the correct number of sentences. """ + cdef int length = len(array) + cdef attr_t value + cdef int i, head_col, ancestor_i + old_to_new_root = dict() + if HEAD in attrs: + head_col = attrs.index(HEAD) + for i in range(length): + # if the HEAD refers to a token outside this span, find a more appropriate ancestor + token = self[i] + ancestor_i = token.head.i - self.start # span offset + if ancestor_i not in range(length): + if DEP in attrs: + array[i, attrs.index(DEP)] = dep + + # try finding an ancestor within this span + ancestors = token.ancestors + for ancestor in ancestors: + ancestor_i = ancestor.i - self.start + if ancestor_i in range(length): + array[i, head_col] = ancestor_i - i + + # if there is no appropriate ancestor, define a new artificial root + value = array[i, head_col] + if (i+value) not in range(length): + new_root = old_to_new_root.get(ancestor_i, None) + if new_root is not None: + # take the same artificial root as a previous token from the same sentence + array[i, head_col] = new_root - i + else: + # set this token as the new artificial root + array[i, head_col] = 0 + old_to_new_root[ancestor_i] = i + + return array + def merge(self, *args, **attributes): """Retokenize the document, such that the span is merged into a single token. @@ -500,7 +540,7 @@ cdef class Span: if "root" in self.doc.user_span_hooks: return self.doc.user_span_hooks["root"](self) # This should probably be called 'head', and the other one called - # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ + # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/ cdef int i # First, we scan through the Span, and check whether there's a word # with head==0, i.e. a sentence root. If so, we can return it. The