Unit test for NEL functionality (#5114)

* empty begin_training for sentencizer

* overfitting unit test for entity linker

* fixed NEL IO by storing the entity_vector_length in the cfg
This commit is contained in:
Sofie Van Landeghem 2020-03-06 14:42:23 +01:00 committed by GitHub
parent 3adc511cb0
commit 6ac9fc0619
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 78 additions and 0 deletions

View File

@ -1490,6 +1490,7 @@ class EntityLinker(Pipe):
def to_disk(self, path, exclude=tuple(), **kwargs): def to_disk(self, path, exclude=tuple(), **kwargs):
serialize = {} serialize = {}
self.cfg["entity_width"] = self.kb.entity_vector_length
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["kb"] = lambda p: self.kb.dump(p) serialize["kb"] = lambda p: self.kb.dump(p)
@ -1561,6 +1562,11 @@ class Sentencizer(Pipe):
def from_nlp(cls, nlp, model=None, **cfg): def from_nlp(cls, nlp, model=None, **cfg):
return cls(**cfg) return cls(**cfg)
def begin_training(
    self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
):
    """Intentionally do nothing when training is initialised.

    Every argument is accepted and ignored, so this component can be called
    with the same arguments as other pipeline components.
    NOTE(review): presumably this mirrors the other pipes' `begin_training`
    signature -- confirm against the base class.

    get_examples: Callable returning training examples (unused).
    pipeline: The pipeline this component belongs to (unused).
    sgd: An optimizer (unused).
    **kwargs: Any further keyword arguments (unused).
    RETURNS: None.
    """
    return None
def __call__(self, example): def __call__(self, example):
"""Apply the sentencizer to a Doc and set Token.is_sent_start. """Apply the sentencizer to a Doc and set Token.is_sent_start.

View File

@ -1,8 +1,11 @@
import pytest import pytest
from spacy.kb import KnowledgeBase from spacy.kb import KnowledgeBase
from spacy import util
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import EntityRuler from spacy.pipeline import EntityRuler
from spacy.tests.util import make_tempdir
from spacy.tokens import Span from spacy.tokens import Span
@ -245,3 +248,72 @@ def test_preserving_links_ents_2(nlp):
assert len(list(doc.ents)) == 1 assert len(list(doc.ents)) == 1
assert list(doc.ents)[0].label_ == "LOC" assert list(doc.ents)[0].label_ == "LOC"
assert list(doc.ents)[0].kb_id_ == "Q1" assert list(doc.ents)[0].kb_id_ == "Q1"
# Toy entity-linking corpus: each item is (text, annotations), where the
# "links" annotation maps the (0, 12) character span to a score per KB
# identifier (1.0 for the intended entity, 0.0 for the other candidate).
TRAIN_DATA = [
    (
        "Russ Cochran captured his first major title with his son as caddie.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}},
    ),
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}},
    ),
    (
        "Russ Cochran has been publishing comic art.",
        {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}},
    ),
    (
        "Russ Cochran was a member of University of Kentucky's golf team.",
        {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}},
    ),
]
# The identifier scored 1.0 in each example above, in the same order.
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
def test_overfitting_IO():
    """Overfit the entity linker on TRAIN_DATA, then repeat the check after IO.

    The pipeline is sentencizer -> entity ruler -> entity linker. After 50
    updates the linker loss must be below 0.001 and the predicted KB ids must
    equal GOLD_entities, both for the in-memory pipeline and for a copy that
    was written to a temporary directory and loaded back.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # Custom component that marks "Russ Cochran" as an entity, so the
    # training docs have doc.ents set.
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    )
    nlp.add_pipe(ruler)

    # Run the texts through the pipeline built so far (before the linker is
    # added) and pair each resulting doc with its annotations.
    train_docs = [(nlp(text), annotation) for text, annotation in TRAIN_DATA]

    # Artificial KB in which both candidates get the same frequency and the
    # same prior probability for the alias:
    #   Q2146908 (Russ Cochran): American golfer
    #   Q7381115 (Russ Cochran): publisher
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    for kb_id, vector in (("Q2146908", [6, -4, 3]), ("Q7381115", [9, 1, -7])):
        kb.add_entity(entity=kb_id, freq=12, entity_vector=vector)
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.5, 0.5],
    )

    # Create the entity linker, hand it the KB and put it last in the pipeline.
    linker = nlp.create_pipe("entity_linker")
    linker.set_kb(kb)
    nlp.add_pipe(linker, last=True)

    # Train; only the losses of the final update are inspected.
    sgd = nlp.begin_training()
    for _ in range(50):
        losses = {}
        nlp.update(train_docs, sgd=sgd, losses=losses)
    assert losses["entity_linker"] < 0.001

    # Predictions of the trained, in-memory pipeline.
    predictions = [ent.kb_id_ for text, _ in TRAIN_DATA for ent in nlp(text).ents]
    assert predictions == GOLD_entities

    # The results must be the same after saving to disk and loading again.
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        reloaded = util.load_model_from_path(model_dir)
        predictions = [
            ent.kb_id_ for text, _ in TRAIN_DATA for ent in reloaded(text).ents
        ]
        assert predictions == GOLD_entities