Bugfix in merge_entities (#6005)

* failing test

* bugfix
This commit is contained in:
Sofie Van Landeghem 2020-09-01 21:57:52 +02:00 committed by GitHub
parent 9002bea29f
commit f7a25d69f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 2 deletions

View File

@ -0,0 +1,31 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.pipeline import merge_entities, EntityRuler
def test_issue5918():
    """Regression test for issue #5918 (edge case when merging entities).

    Three ORG entities are set up via an EntityRuler. After re-attaching the
    head of one token, merge_entities must still produce three entities.
    """
    org_names = (
        "Digicon Inc",
        "Rotan Mosle Inc's",
        "Rotan Mosle Technology Partners Ltd",
    )
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "ORG", "pattern": name} for name in org_names])
    nlp.add_pipe(ruler)

    text = (
        "\n"
        "Digicon Inc said it has completed the previously-announced disposition\n"
        "of its computer systems division to an investment group led by\n"
        "Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.\n"
    )
    doc = nlp(text)
    assert len(doc.ents) == 3

    # Make the third span's head lie within the entity (ent_iob=I).
    # Bug #5918 would wrongly transfer that I to the full merged entity,
    # leaving 2 instead of 3 final ents.
    doc[29].head = doc[33]
    merged = merge_entities(doc)
    assert len(merged.ents) == 3

View File

@ -175,6 +175,8 @@ def _merge(Doc doc, merges):
spans.append(span) spans.append(span)
# House the new merged token where it starts # House the new merged token where it starts
token = &doc.c[start] token = &doc.c[start]
start_ent_iob = doc.c[start].ent_iob
start_ent_type = doc.c[start].ent_type
# Initially set attributes to attributes of span root # Initially set attributes to attributes of span root
token.tag = doc.c[span.root.i].tag token.tag = doc.c[span.root.i].tag
token.pos = doc.c[span.root.i].pos token.pos = doc.c[span.root.i].pos
@ -187,8 +189,8 @@ def _merge(Doc doc, merges):
merged_iob = 3 merged_iob = 3
# If start token is I-ENT and previous token is of the same # If start token is I-ENT and previous token is of the same
# type, then I-ENT (could check I-ENT from start to span root) # type, then I-ENT (could check I-ENT from start to span root)
if doc.c[start].ent_iob == 1 and start > 0 \ if start_ent_iob == 1 and start > 0 \
and doc.c[start].ent_type == token.ent_type \ and start_ent_type == token.ent_type \
and doc.c[start - 1].ent_type == token.ent_type: and doc.c[start - 1].ent_type == token.ent_type:
merged_iob = 1 merged_iob = 1
token.ent_iob = merged_iob token.ent_iob = merged_iob