Unit test for NEL functionality (#5114)

* empty begin_training for sentencizer * overfitting unit test for entity linker * fixed NEL IO by storing the entity_vector_length in the cfg
2020-03-06 14:42:23 +01:00 · 2020-03-06 14:42:23 +01:00 · 6ac9fc0619
parent 3adc511cb0
commit 6ac9fc0619
2 changed files with 78 additions and 0 deletions
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -1490,6 +1490,7 @@ class EntityLinker(Pipe):

    def to_disk(self, path, exclude=tuple(), **kwargs):
        serialize = {}
+        self.cfg["entity_width"] = self.kb.entity_vector_length
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        serialize["kb"] = lambda p: self.kb.dump(p)
@ -1561,6 +1562,11 @@ class Sentencizer(Pipe):
    def from_nlp(cls, nlp, model=None, **cfg):
        return cls(**cfg)

+    def begin_training(
+        self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
+    ):
+        pass
+
    def __call__(self, example):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@ -1,8 +1,11 @@
 import pytest

 from spacy.kb import KnowledgeBase
+
+from spacy import util
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
+from spacy.tests.util import make_tempdir
 from spacy.tokens import Span


@ -245,3 +248,72 @@ def test_preserving_links_ents_2(nlp):
    assert len(list(doc.ents)) == 1
    assert list(doc.ents)[0].label_ == "LOC"
    assert list(doc.ents)[0].kb_id_ == "Q1"
+
+
+# fmt: off
+TRAIN_DATA = [
+    ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+    ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+    ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}),
+    ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}}),
+]
+GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
+# fmt: on
+
+
+def test_overfitting_IO():
+    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    TRAIN_DOCS = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        annotation_clean = annotation
+        TRAIN_DOCS.append((doc, annotation_clean))
+
+    # create artificial KB - assign same prior weight to the two russ cochran's
+    # Q2146908 (Russ Cochran): American golfer
+    # Q7381115 (Russ Cochran): publisher
+    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+    mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+    mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+    mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5])
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = nlp.create_pipe("entity_linker")
+    entity_linker.set_kb(mykb)
+    nlp.add_pipe(entity_linker, last=True)
+
+    # train the NEL pipe
+    optimizer = nlp.begin_training()
+    for i in range(50):
+        losses = {}
+        nlp.update(TRAIN_DOCS, sgd=optimizer, losses=losses)
+    assert losses["entity_linker"] < 0.001
+
+    # test the trained model
+    predictions = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        for ent in doc.ents:
+            predictions.append(ent.kb_id_)
+    assert predictions == GOLD_entities
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        predictions = []
+        for text, annotation in TRAIN_DATA:
+            doc2 = nlp2(text)
+            for ent in doc2.ents:
+                predictions.append(ent.kb_id_)
+        assert predictions == GOLD_entities