Unit test for NEL functionality (#5114)

* empty begin_training for sentencizer

* overfitting unit test for entity linker

* fixed NEL IO by storing the entity_vector_length in the cfg
This commit is contained in:
Sofie Van Landeghem 2020-03-06 14:42:23 +01:00 committed by GitHub
parent 3adc511cb0
commit 6ac9fc0619
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 78 additions and 0 deletions

View File

@ -1490,6 +1490,7 @@ class EntityLinker(Pipe):
def to_disk(self, path, exclude=tuple(), **kwargs):
serialize = {}
self.cfg["entity_width"] = self.kb.entity_vector_length
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p)
@ -1561,6 +1562,11 @@ class Sentencizer(Pipe):
def from_nlp(cls, nlp, model=None, **cfg):
    """Create the component from keyword settings.

    `nlp` and `model` are accepted but not used here; only the extra
    keyword arguments in `cfg` are forwarded to the constructor.
    """
    component = cls(**cfg)
    return component
def begin_training(
    self,
    get_examples=lambda: [],
    pipeline=None,
    sgd=None,
    **kwargs
):
    """Do nothing: this hook is deliberately left empty.

    All arguments are accepted and ignored, and the method returns None.
    """
    return None
def __call__(self, example):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.

View File

@ -1,8 +1,11 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy import util
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tests.util import make_tempdir
from spacy.tokens import Span
@ -245,3 +248,72 @@ def test_preserving_links_ents_2(nlp):
assert len(list(doc.ents)) == 1
assert list(doc.ents)[0].label_ == "LOC"
assert list(doc.ents)[0].kb_id_ == "Q1"
# fmt: off
# Each example pairs a sentence with its gold "links": the (start, end) offsets
# of the mention mapped to a score per candidate KB identifier
# (1.0 for the correct candidate, 0.0 for the other one).
TRAIN_DATA = [
    (
        "Russ Cochran captured his first major title with his son as caddie.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}},
    ),
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}},
    ),
    (
        "Russ Cochran has been publishing comic art.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}},
    ),
    (
        "Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}},
    ),
]
# Expected KB identifier for the mention in each example, in TRAIN_DATA order.
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
def test_overfitting_IO():
    """Overfit the entity linker on a tiny data set, then round-trip it through disk.

    The pipeline must reproduce GOLD_entities on its own training texts both
    right after training and after being saved and loaded again.
    """

    def predicted_kb_ids(pipeline):
        # KB identifiers of every entity found in the training texts, in order.
        return [ent.kb_id_ for text, _ in TRAIN_DATA for ent in pipeline(text).ents]

    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # A rule-based component marks "Russ Cochran" as an entity, so the
    # training docs have doc.ents set for the example data.
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    )
    nlp.add_pipe(ruler)

    # Run the texts through the pipeline first so each example carries a doc.
    TRAIN_DOCS = [(nlp(text), annotation) for text, annotation in TRAIN_DATA]

    # Artificial KB with the same prior weight for both Russ Cochrans:
    #   Q2146908 (Russ Cochran): American golfer
    #   Q7381115 (Russ Cochran): publisher
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
    mykb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.5, 0.5],
    )

    # Attach the KB to a new entity linker and put it last in the pipeline.
    entity_linker = nlp.create_pipe("entity_linker")
    entity_linker.set_kb(mykb)
    nlp.add_pipe(entity_linker, last=True)

    # Train the NEL pipe; only the losses of the final update are checked.
    optimizer = nlp.begin_training()
    for _ in range(50):
        losses = {}
        nlp.update(TRAIN_DOCS, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

    # The trained model must reproduce the gold links on its training texts.
    assert predicted_kb_ids(nlp) == GOLD_entities

    # The results must still be the same after saving and loading.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert predicted_kb_ids(nlp2) == GOLD_entities