From f7a25d69f798841fcf54d924a6c84b784b2bc882 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 1 Sep 2020 21:57:52 +0200 Subject: [PATCH] Bugfix in merge_entities (#6005) * failing test * bugfix --- spacy/tests/regression/test_issue5918.py | 31 ++++++++++++++++++++++++ spacy/tokens/_retokenize.pyx | 6 +++-- 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/regression/test_issue5918.py diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py new file mode 100644 index 000000000..2dee26d82 --- /dev/null +++ b/spacy/tests/regression/test_issue5918.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from spacy.lang.en import English +from spacy.pipeline import merge_entities, EntityRuler + + +def test_issue5918(): + # Test edge case when merging entities. + nlp = English() + patterns = [ + {"label": "ORG", "pattern": "Digicon Inc"}, + {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, + {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, + ] + ruler = EntityRuler(nlp) + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + text = """ + Digicon Inc said it has completed the previously-announced disposition + of its computer systems division to an investment group led by + Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. + """ + doc = nlp(text) + assert len(doc.ents) == 3 + # Make the third entity's head a token that lies inside the entity (one tagged ent_iob=I). + # Bug #5918 wrongly copied that "I" (inside-entity) tag onto the merged token, which fused it with the preceding entity and left 2 instead of 3 final ents. 
+ doc[29].head = doc[33] + doc = merge_entities(doc) + assert len(doc.ents) == 3 diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index ce8e510d6..abc9b731b 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -175,6 +175,8 @@ def _merge(Doc doc, merges): spans.append(span) # House the new merged token where it starts token = &doc.c[start] + start_ent_iob = doc.c[start].ent_iob + start_ent_type = doc.c[start].ent_type # Initially set attributes to attributes of span root token.tag = doc.c[span.root.i].tag token.pos = doc.c[span.root.i].pos @@ -187,8 +189,8 @@ def _merge(Doc doc, merges): merged_iob = 3 # If start token is I-ENT and previous token is of the same # type, then I-ENT (could check I-ENT from start to span root) - if doc.c[start].ent_iob == 1 and start > 0 \ - and doc.c[start].ent_type == token.ent_type \ + if start_ent_iob == 1 and start > 0 \ + and start_ent_type == token.ent_type \ and doc.c[start - 1].ent_type == token.ent_type: merged_iob = 1 token.ent_iob = merged_iob