From b92f81d5da43da759e44f1e1c6859805873546a4 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 22 Feb 2021 04:49:52 +0100
Subject: [PATCH] fix NEL config and IO, and n_sents functionality (#7100)

* fix NEL config and IO, and n_sents functionality

* add docs

* fix test
---
 spacy/pipeline/entity_linker.py            | 29 +++++++++++-----------
 spacy/tests/pipeline/test_entity_linker.py |  8 ++++++
 spacy/tests/pipeline/test_pipe_methods.py  |  4 +--
 website/docs/api/entitylinker.md           | 19 ++++++++------
 4 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index e25777a21..630057c3f 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -45,6 +45,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
     default_config={
         "model": DEFAULT_NEL_MODEL,
         "labels_discard": [],
+        "n_sents": 0,
         "incl_prior": True,
         "incl_context": True,
         "entity_vector_length": 64,
@@ -62,6 +63,7 @@ def make_entity_linker(
     model: Model,
     *,
     labels_discard: Iterable[str],
+    n_sents: int,
     incl_prior: bool,
     incl_context: bool,
     entity_vector_length: int,
@@ -73,6 +75,7 @@ def make_entity_linker(
         representations. Given a batch of Doc objects, it should return a single
         array, with one row per item in the batch.
     labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
+    n_sents (int): The number of neighbouring sentences to take into account.
     incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
     incl_context (bool): Whether or not to include the local context in the model.
     entity_vector_length (int): Size of encoding vectors in the KB.
@@ -84,6 +87,7 @@ def make_entity_linker(
         model,
         name,
         labels_discard=labels_discard,
+        n_sents=n_sents,
         incl_prior=incl_prior,
         incl_context=incl_context,
         entity_vector_length=entity_vector_length,
@@ -106,6 +110,7 @@ class EntityLinker(TrainablePipe):
         name: str = "entity_linker",
         *,
         labels_discard: Iterable[str],
+        n_sents: int,
         incl_prior: bool,
         incl_context: bool,
         entity_vector_length: int,
@@ -118,6 +123,7 @@ class EntityLinker(TrainablePipe):
         name (str): The component instance name, used to add entries to the
             losses during training.
         labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
+        n_sents (int): The number of neighbouring sentences to take into account.
         incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
         incl_context (bool): Whether or not to include the local context in the model.
         entity_vector_length (int): Size of encoding vectors in the KB.
@@ -129,17 +135,14 @@ class EntityLinker(TrainablePipe):
         self.vocab = vocab
         self.model = model
         self.name = name
-        cfg = {
-            "labels_discard": list(labels_discard),
-            "incl_prior": incl_prior,
-            "incl_context": incl_context,
-            "entity_vector_length": entity_vector_length,
-        }
+        self.labels_discard = list(labels_discard)
+        self.n_sents = n_sents
+        self.incl_prior = incl_prior
+        self.incl_context = incl_context
         self.get_candidates = get_candidates
-        self.cfg = dict(cfg)
+        self.cfg = {}
         self.distance = CosineDistance(normalize=False)
         # how many neightbour sentences to take into account
-        self.n_sents = cfg.get("n_sents", 0)
         # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
         self.kb = empty_kb(entity_vector_length)(self.vocab)
 
@@ -150,7 +153,6 @@ class EntityLinker(TrainablePipe):
             raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
 
         self.kb = kb_loader(self.vocab)
-        self.cfg["entity_vector_length"] = self.kb.entity_vector_length
 
     def validate_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
@@ -312,14 +314,13 @@ class EntityLinker(TrainablePipe):
                         sent_doc = doc[start_token:end_token].as_doc()
                         # currently, the context is the same for each entity in a sentence (should be refined)
                         xp = self.model.ops.xp
-                        if self.cfg.get("incl_context"):
+                        if self.incl_context:
                             sentence_encoding = self.model.predict([sent_doc])[0]
                             sentence_encoding_t = sentence_encoding.T
                             sentence_norm = xp.linalg.norm(sentence_encoding_t)
                         for ent in sent.ents:
                             entity_count += 1
-                            to_discard = self.cfg.get("labels_discard", [])
-                            if to_discard and ent.label_ in to_discard:
+                            if ent.label_ in self.labels_discard:
                                 # ignoring this entity - setting to NIL
                                 final_kb_ids.append(self.NIL)
                             else:
@@ -337,13 +338,13 @@ class EntityLinker(TrainablePipe):
                                     prior_probs = xp.asarray(
                                         [c.prior_prob for c in candidates]
                                     )
-                                    if not self.cfg.get("incl_prior"):
+                                    if not self.incl_prior:
                                         prior_probs = xp.asarray(
                                             [0.0 for _ in candidates]
                                         )
                                     scores = prior_probs
                                     # add in similarity from the context
-                                    if self.cfg.get("incl_context"):
+                                    if self.incl_context:
                                         entity_encodings = xp.asarray(
                                             [c.entity_vector for c in candidates]
                                         )
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 8ba2d0d3e..348298e06 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -250,6 +250,14 @@ def test_el_pipe_configuration(nlp):
     assert doc[2].ent_kb_id_ == "Q2"
 
 
+def test_nel_nsents(nlp):
+    """Test that n_sents can be set through the configuration"""
+    entity_linker = nlp.add_pipe("entity_linker", config={})
+    assert entity_linker.n_sents == 0
+    entity_linker = nlp.replace_pipe("entity_linker", "entity_linker", config={"n_sents": 2})
+    assert entity_linker.n_sents == 2
+
+
 def test_vocab_serialization(nlp):
     """Test that string information is retained across storage"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 6a21ddfaa..9af8395a6 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -83,9 +83,9 @@ def test_replace_last_pipe(nlp):
 def test_replace_pipe_config(nlp):
     nlp.add_pipe("entity_linker")
     nlp.add_pipe("sentencizer")
-    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is True
+    assert nlp.get_pipe("entity_linker").incl_prior is True
     nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False})
-    assert nlp.get_pipe("entity_linker").cfg["incl_prior"] is False
+    assert nlp.get_pipe("entity_linker").incl_prior is False
 
 
 @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index a794ce632..1cc864059 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -31,6 +31,7 @@ architectures and their arguments and hyperparameters.
 > from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
 > config = {
 >    "labels_discard": [],
+>    "n_sents": 0,
 >    "incl_prior": True,
 >    "incl_context": True,
 >    "model": DEFAULT_NEL_MODEL,
@@ -43,6 +44,7 @@ architectures and their arguments and hyperparameters.
 | Setting                | Description                                                                                                                                                                                                                                                              |
 | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `labels_discard`       | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                           |
+| `n_sents`              | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                        |
 | `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                     |
 | `incl_context`         | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                   |
 | `model`                | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                   |
@@ -89,6 +91,7 @@ custom knowledge base, you should either call
 | `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~                                                                                      |
 | `get_candidates`       | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
 | `labels_discard`       | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~                                                   |
+| `n_sents`              | The number of neighbouring sentences to take into account. ~~int~~                                                               |
 | `incl_prior`           | Whether or not to include prior probabilities from the KB in the model. ~~bool~~                                                 |
 | `incl_context`         | Whether or not to include the local context in the model. ~~bool~~                                                               |
 
@@ -247,14 +250,14 @@ pipe's entity linking model and context encoder. Delegates to
 > losses = entity_linker.update(examples, sgd=optimizer)
 > ```
 
-| Name              | Description                                                                                                                        |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
-| _keyword-only_    |                                                                                                                                    |
-| `drop`            | The dropout rate. ~~float~~                                                                                                        |
-| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
-| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
-| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
+| Name           | Description                                                                                                              |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
+| _keyword-only_ |                                                                                                                          |
+| `drop`         | The dropout rate. ~~float~~                                                                                              |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
+| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 
 ## EntityLinker.score {#score tag="method" new="3"}