From 0b375d50c8a5a1bb198219d4d5b2cc4f9e038a00 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 26 Mar 2018 07:16:06 +0200 Subject: [PATCH] Fix ent_iob tags in doc.merge to avoid inconsistent sequences --- spacy/tokens/doc.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 91ab1d8a8..1885dc872 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -949,6 +949,13 @@ cdef class Doc: self.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) + # Make sure ent_iob remains consistent + if self.c[end].ent_iob == 1 and token.ent_iob in (0, 2): + if token.ent_type == self.c[end].ent_type: + token.ent_iob = 3 + else: + # If they're not the same entity type, let them be two entities + self.c[end].ent_iob = 3 # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a