Bugfix in merge_entities (#6005)

* failing test * bugfix
2020-09-01 21:57:52 +02:00 · 2020-09-01 21:57:52 +02:00 · f7a25d69f7
parent 9002bea29f
commit f7a25d69f7
2 changed files with 35 additions and 2 deletions
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@ -0,0 +1,31 @@
 # coding: utf8
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities, EntityRuler
 def test_issue5918():
    """Regression test for issue #5918: merge_entities must keep all entities.

    Three ORG patterns are registered with an EntityRuler, a text containing
    all three is processed, and the entity count is checked both before and
    after calling merge_entities.
    """
    nlp = English()
    org_names = (
        "Digicon Inc",
        "Rotan Mosle Inc's",
        "Rotan Mosle Technology Partners Ltd",
    )
    org_patterns = [{"label": "ORG", "pattern": name} for name in org_names]
    entity_ruler = EntityRuler(nlp)
    entity_ruler.add_patterns(org_patterns)
    nlp.add_pipe(entity_ruler)
    text = """
        Digicon Inc said it has completed the previously-announced disposition
        of its computer systems division to an investment group led by
        Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.
        """
    doc = nlp(text)
    assert len(doc.ents) == 3
    # Re-point the head of token 29 at token 33 so that the third span's head
    # lies within the entity (ent_iob=I). With bug #5918 that I was wrongly
    # transferred to the whole merged entity, leaving 2 final ents instead
    # of 3.
    child_token, head_token = doc[29], doc[33]
    child_token.head = head_token
    merged_doc = merge_entities(doc)
    assert len(merged_doc.ents) == 3
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -175,6 +175,8 @@ def _merge(Doc doc, merges):
        spans.append(span)
        # House the new merged token where it starts
        token = &doc.c[start]
        start_ent_iob = doc.c[start].ent_iob
        start_ent_type = doc.c[start].ent_type
        # Initially set attributes to attributes of span root
        token.tag = doc.c[span.root.i].tag
        token.pos = doc.c[span.root.i].pos
@ -187,8 +189,8 @@ def _merge(Doc doc, merges):
            merged_iob = 3
            # If start token is I-ENT and previous token is of the same
            # type, then I-ENT (could check I-ENT from start to span root)
-            if doc.c[start].ent_iob == 1 and start > 0 \
+            if start_ent_iob == 1 and start > 0 \
-                    and doc.c[start].ent_type == token.ent_type \
+                    and start_ent_type == token.ent_type \
                    and doc.c[start - 1].ent_type == token.ent_type:
                merged_iob = 1
        token.ent_iob = merged_iob