From cbd2794be0da732955418ff2680b91fc472e63ef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 25 Mar 2018 22:16:19 +0200 Subject: [PATCH 1/5] Add test for ent_iob during span merge --- spacy/tests/doc/test_span_merge.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_span_merge.py index 61f8ca50d..ae1f4f4a1 100644 --- a/spacy/tests/doc/test_span_merge.py +++ b/spacy/tests/doc/test_span_merge.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from ..util import get_doc +from ...vocab import Vocab +from ...tokens import Doc import pytest @@ -95,6 +97,21 @@ def test_spans_entity_merge(en_tokenizer): assert len(doc) == 15 +def test_spans_entity_merge_iob(): + # Test entity IOB stays consistent after merging + words = ["a", "b", "c", "d", "e"] + doc = Doc(Vocab(), words=words) + doc.ents = [(doc.vocab.strings.add('ent-abc'), 0, 3), + (doc.vocab.strings.add('ent-d'), 3, 4)] + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + assert doc[2].ent_iob_ == "I" + assert doc[3].ent_iob_ == "B" + doc[0:1].merge() + assert doc[0].ent_iob_ == "B" + assert doc[1].ent_iob_ == "I" + + def test_spans_sentence_update_after_merge(en_tokenizer): text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale." heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7] From 99fbc7db337ec56c58436f31834e2ee94babe592 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 26 Mar 2018 07:13:34 +0200 Subject: [PATCH 2/5] Improve error message when entity sequence is inconsistent --- spacy/tokens/doc.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8c5e04ea6..7bed0cd7d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -421,7 +421,12 @@ cdef class Doc: for i in range(self.length): token = &self.c[i] if token.ent_iob == 1: - assert start != -1 + if start == -1: + seq = ['%s|%s' % (t.text, t.ent_iob_) for t in self[i-5:i+5]] + raise ValueError( + "token.ent_iob values make invalid sequence: " + "I without B\n" + "{seq}".format(seq=' '.join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0: if start != -1: output.append(Span(self, start, i, label=label)) From e807f88410a2bed60779ceb5d8f779dd87625429 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 26 Mar 2018 07:14:16 +0200 Subject: [PATCH 3/5] Resolve merge when cherry-picking ent iob patches from develop --- spacy/tokens/doc.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7bed0cd7d..91ab1d8a8 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -451,10 +451,7 @@ cdef class Doc: cdef int i for i in range(self.length): self.c[i].ent_type = 0 - # At this point we don't know whether the NER has run over the - # Doc. If the ent_iob is missing, leave it missing. - if self.c[i].ent_iob != 0: - self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. + self.c[i].ent_iob = 0 # Means missing. cdef attr_t ent_type cdef int start, end for ent_info in ents: From 95fa89c4b8babd0278577b093868f046fc52a26c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 26 Mar 2018 07:14:35 +0200 Subject: [PATCH 4/5] Update doc.ents test --- spacy/tests/doc/test_add_entities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index cd444ba81..31d2b8420 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -18,7 +18,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): assert [w.ent_iob_ for w in doc] == (['O'] * len(doc)) doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)] - assert [w.ent_iob_ for w in doc] == ['O', 'O', 'O', 'B'] + assert [w.ent_iob_ for w in doc] == ['', '', '', 'B'] doc.ents = [(doc.vocab.strings['WORD'], 0, 2)] - assert [w.ent_iob_ for w in doc] == ['B', 'I', 'O', 'O'] + assert [w.ent_iob_ for w in doc] == ['B', 'I', '', ''] From 0b375d50c8a5a1bb198219d4d5b2cc4f9e038a00 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 26 Mar 2018 07:16:06 +0200 Subject: [PATCH 5/5] Fix ent_iob tags in doc.merge to avoid inconsistent sequences --- spacy/tokens/doc.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 91ab1d8a8..1885dc872 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -949,6 +949,13 @@ cdef class Doc: self.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) + # Make sure ent_iob remains consistent + if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2): + if token.ent_type == self.c[end].ent_type: + token.ent_iob = 3 + else: + # If they're not the same entity type, let them be two entities + self.c[end].ent_iob = 3 # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a