mirror of https://github.com/explosion/spaCy.git
Unit test for NEL functionality (#5114)
* empty begin_training for sentencizer
* overfitting unit test for entity linker
* fixed NEL IO by storing the entity_vector_length in the cfg
This commit is contained in:
parent 3adc511cb0
commit 6ac9fc0619
@@ -1490,6 +1490,7 @@ class EntityLinker(Pipe):
     def to_disk(self, path, exclude=tuple(), **kwargs):
         serialize = {}
+        self.cfg["entity_width"] = self.kb.entity_vector_length
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["kb"] = lambda p: self.kb.dump(p)
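The matching read path is not part of this hunk. A minimal sketch of the counterpart from_disk, assuming this spaCy version's KnowledgeBase.load_bulk, EntityLinker.set_kb and spacy.util.from_disk helpers (not the committed implementation), shows why the entity_width entry has to be written first:

    # A sketch, not the committed code: the KB can only be deserialized into
    # a KnowledgeBase created with the same entity vector width, which is why
    # to_disk stores it in the cfg.
    def from_disk(self, path, exclude=tuple(), **kwargs):
        def load_kb(p):
            kb = KnowledgeBase(vocab=self.vocab,
                               entity_vector_length=self.cfg["entity_width"])
            kb.load_bulk(p)  # read back what self.kb.dump(p) wrote
            self.set_kb(kb)

        deserialize = {}
        # cfg must be read before the KB so entity_width is available
        deserialize["cfg"] = lambda p: self.cfg.update(srsly.read_json(p))
        deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
        deserialize["kb"] = load_kb
        util.from_disk(path, deserialize, exclude)
        return self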
@@ -1561,6 +1562,11 @@ class Sentencizer(Pipe):
     def from_nlp(cls, nlp, model=None, **cfg):
         return cls(**cfg)

+    def begin_training(
+        self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
+    ):
+        pass
+
     def __call__(self, example):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
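The empty override matters because the sentencizer is purely rule-based and has no model to initialize. A short usage sketch, assuming Language.begin_training delegates to each pipe's begin_training:

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))  # rule-based, no ML model
    # With the no-op begin_training, initializing the full pipeline no longer
    # trips over the sentencizer - which the new NEL test below relies on.
    optimizer = nlp.begin_training()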
@@ -1,8 +1,11 @@
 import pytest

 from spacy.kb import KnowledgeBase

+from spacy import util
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Span
@@ -245,3 +248,72 @@ def test_preserving_links_ents_2(nlp):
     assert len(list(doc.ents)) == 1
     assert list(doc.ents)[0].label_ == "LOC"
     assert list(doc.ents)[0].kb_id_ == "Q1"
+
+
+# fmt: off
+TRAIN_DATA = [
+    ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+    ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+    ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+    ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+]
+GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
+# fmt: on
+
+
+def test_overfitting_IO():
+    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    TRAIN_DOCS = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        annotation_clean = annotation
+        TRAIN_DOCS.append((doc, annotation_clean))
+
+    # Create an artificial KB - assign the same prior weight to the two Russ Cochrans
+    # Q2146908 (Russ Cochran): American golfer
+    # Q7381115 (Russ Cochran): publisher
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+    mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+    mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = nlp.create_pipe("entity_linker")
+    entity_linker.set_kb(mykb)
+    nlp.add_pipe(entity_linker, last=True)
+
+    # Train the NEL pipe
+    optimizer = nlp.begin_training()
+    for i in range(50):
+        losses = {}
+        nlp.update(TRAIN_DOCS, sgd=optimizer, losses=losses)
+    assert losses["entity_linker"] < 0.001
+
+    # Test the trained model
+    predictions = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        for ent in doc.ents:
+            predictions.append(ent.kb_id_)
+    assert predictions == GOLD_entities
+
+    # Also test that the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        predictions = []
+        for text, annotation in TRAIN_DATA:
+            doc2 = nlp2(text)
+            for ent in doc2.ents:
+                predictions.append(ent.kb_id_)
+        assert predictions == GOLD_entities
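To run just the new test locally (assuming this hunk lands in spacy/tests/pipeline/test_entity_linker.py, the file that defines test_preserving_links_ents_2):

    pytest spacy/tests/pipeline/test_entity_linker.py::test_overfitting_IO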