mirror of https://github.com/explosion/spaCy.git
parent
9002bea29f
commit
f7a25d69f7
|
@ -0,0 +1,31 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.pipeline import merge_entities, EntityRuler
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue5918():
    """Regression test for issue #5918 (edge case when merging entities).

    Three ORG entities are set by an EntityRuler, two of them directly
    adjacent. The first token of the third entity is then given a head
    that lies inside the same entity, and the entities are merged.
    Bug #5918 would wrongly transfer the inner token's I tag to the whole
    merged entity, resulting in 2 instead of 3 final entities.
    """
    nlp = English()
    patterns = [
        {"label": "ORG", "pattern": "Digicon Inc"},
        {"label": "ORG", "pattern": "Rotan Mosle Inc's"},
        {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"},
    ]
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    text = """
        Digicon Inc said it has completed the previously-announced disposition
        of its computer systems division to an investment group led by
        Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate.
        """
    doc = nlp(text)
    assert len(doc.ents) == 3

    # Locate the third entity from the doc itself instead of relying on
    # hard-coded token positions (previously doc[29] and doc[33]), which
    # silently depend on how the fixture's whitespace is tokenized.
    third_ent = doc.ents[2]
    assert third_ent.text == "Rotan Mosle Technology Partners Ltd"
    first_i = third_ent.start
    last_i = third_ent.end - 1

    # make it so that the third span's head is within the entity (ent_iob=I)
    # bug #5918 would wrongly transfer that I to the full entity, resulting
    # in 2 instead of 3 final ents.
    doc[first_i].head = doc[last_i]
    doc = merge_entities(doc)
    assert len(doc.ents) == 3
|
|
@ -175,6 +175,8 @@ def _merge(Doc doc, merges):
|
||||||
spans.append(span)
|
spans.append(span)
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
token = &doc.c[start]
|
token = &doc.c[start]
|
||||||
|
start_ent_iob = doc.c[start].ent_iob
|
||||||
|
start_ent_type = doc.c[start].ent_type
|
||||||
# Initially set attributes to attributes of span root
|
# Initially set attributes to attributes of span root
|
||||||
token.tag = doc.c[span.root.i].tag
|
token.tag = doc.c[span.root.i].tag
|
||||||
token.pos = doc.c[span.root.i].pos
|
token.pos = doc.c[span.root.i].pos
|
||||||
|
@ -187,8 +189,8 @@ def _merge(Doc doc, merges):
|
||||||
merged_iob = 3
|
merged_iob = 3
|
||||||
# If start token is I-ENT and previous token is of the same
|
# If start token is I-ENT and previous token is of the same
|
||||||
# type, then I-ENT (could check I-ENT from start to span root)
|
# type, then I-ENT (could check I-ENT from start to span root)
|
||||||
if doc.c[start].ent_iob == 1 and start > 0 \
|
if start_ent_iob == 1 and start > 0 \
|
||||||
and doc.c[start].ent_type == token.ent_type \
|
and start_ent_type == token.ent_type \
|
||||||
and doc.c[start - 1].ent_type == token.ent_type:
|
and doc.c[start - 1].ent_type == token.ent_type:
|
||||||
merged_iob = 1
|
merged_iob = 1
|
||||||
token.ent_iob = merged_iob
|
token.ent_iob = merged_iob
|
||||||
|
|
Loading…
Reference in New Issue