From 478305cd3f16cbfad2ea6cb9ccf49f434c3395aa Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 18 Jun 2019 18:38:09 +0200
Subject: [PATCH] small tweaks and documentation

---
 bin/wiki_entity_linking/train_descriptions.py |  5 ++
 .../training_set_creator.py                   |  9 ++-
 bin/wiki_entity_linking/wikidata_processor.py |  2 +-
 .../wikipedia_processor.py                    |  3 +-
 examples/pipeline/wikidata_entity_linking.py  | 14 +++--
 spacy/language.py                             |  2 +-
 spacy/pipeline/pipes.pyx                      | 60 ++++++++-----------
 7 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
index cc5016237..82db582dc 100644
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@@ -12,6 +12,10 @@ from thinc.neural._classes.affine import Affine
 
 
 class EntityEncoder:
+    """
+    Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
+    These entity vectors will be stored in the KB, and context vectors will be trained to be similar to them.
+    """
 
     DROP = 0
     EPOCHS = 5
@@ -102,6 +106,7 @@ class EntityEncoder:
 
     def _build_network(self, orig_width, hidden_with):
         with Model.define_operators({">>": chain}):
+            # very simple encoder-decoder model
             self.encoder = (
                 Affine(hidden_with, orig_width)
             )
diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py
index a0d130824..90df5d9fc 100644
--- a/bin/wiki_entity_linking/training_set_creator.py
+++ b/bin/wiki_entity_linking/training_set_creator.py
@@ -10,7 +10,8 @@ from spacy.gold import GoldParse
 from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp
 
 """
-Process Wikipedia interlinks to generate a training dataset for the EL algorithm
+Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
+Gold-standard entities are stored in one file in standoff format (by character offset).
 """
 
 # ENTITY_FILE = "gold_entities.csv"
@@ -321,12 +322,16 @@ def read_training(nlp, training_dir, dev, limit):
                                     current_article_id = article_id
                                     ents_by_offset = dict()
                                     for ent in current_doc.ents:
-                                        ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent
+                                        sent_length = len(ent.sent)
+                                        # custom filtering to skip entities in sentences that are too short or too long
+                                        if 5 < sent_length < 100:
+                                            ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent
                                 else:
                                     skip_articles.add(current_article_id)
                                     current_doc = None
                         except Exception as e:
                             print("Problem parsing article", article_id, e)
+                            skip_articles.add(current_article_id)
 
                     # repeat checking this condition in case an exception was thrown
                     if current_doc and (current_article_id == article_id):
diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py
index 899c607cc..85d3d8488 100644
--- a/bin/wiki_entity_linking/wikidata_processor.py
+++ b/bin/wiki_entity_linking/wikidata_processor.py
@@ -10,7 +10,7 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js
 
 
 def read_wikidata_entities_json(limit=None, to_print=False):
-    """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """
+    # Read the JSON wiki data and parse out the entities. Takes about 7h30 to parse 55M lines.
 
     lang = 'en'
     site_filter = 'enwiki'
diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
index 0747c9db7..d957fc58c 100644
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -8,6 +8,7 @@ import datetime
 
 """
 Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
+Write these results to file for downstream KB and training data generation.
 """
 
 
@@ -142,7 +143,7 @@ def _capitalize_first(text):
 
 
 def write_entity_counts(prior_prob_input, count_output, to_print=False):
-    """ Write entity counts for quick access later  """
+    # Write entity counts for quick access later
     entity_to_count = dict()
     total_count = 0
 
diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index d537cce7e..c282c7262 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -195,10 +195,11 @@ def run_pipeline():
             print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
             print()
 
-            acc_r, acc_r_by_label, acc_p, acc_p_by_label, acc_o, acc_o_by_label = _measure_baselines(dev_data, kb_2)
-            print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_by_label.items()])
-            print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_by_label.items()])
-            print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_by_label.items()])
+            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2)
+            print("dev counts:", sorted(counts.items()))  # (label, count) pairs; sorted(counts) would print labels only
+            print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()])
+            print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()])
+            print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()])
 
             with el_pipe.model.use_params(optimizer.averages):
                 # measuring combined accuracy (prior + context)
@@ -288,6 +289,8 @@ def _measure_accuracy(data, el_pipe):
 
 def _measure_baselines(data, kb):
     # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound
+    counts_by_label = dict()
+
     random_correct_by_label = dict()
     random_incorrect_by_label = dict()
 
@@ -315,6 +318,7 @@ def _measure_baselines(data, kb):
 
                 # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
                 if gold_entity is not None:
+                    counts_by_label[ent_label] = counts_by_label.get(ent_label, 0) + 1
                     candidates = kb.get_candidates(ent.text)
                     oracle_candidate = ""
                     best_candidate = ""
@@ -353,7 +357,7 @@ def _measure_baselines(data, kb):
     acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label)
     acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label)
 
-    return acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label
+    return counts_by_label, acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label
 
 
 def calculate_acc(correct_by_label, incorrect_by_label):
diff --git a/spacy/language.py b/spacy/language.py
index 0e5e29244..2225a763e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -11,7 +11,7 @@ from copy import copy, deepcopy
 from thinc.neural import Model
 import srsly
 
-from spacy.kb import KnowledgeBase
+from .kb import KnowledgeBase
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 7d90c4438..99c361964 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -14,7 +14,6 @@ from thinc.misc import LayerNorm
 from thinc.neural.util import to_categorical
 from thinc.neural.util import get_array_module
 
-from spacy.kb import KnowledgeBase
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -1081,9 +1080,9 @@ class EntityLinker(Pipe):
         hidden_width = cfg.get("hidden_width", 128)
 
         # no default because this needs to correspond with the KB entity length
-        sent_width = cfg.get("entity_width")
+        entity_width = cfg.get("entity_width")
 
-        model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg)
+        model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg)
 
         return model
 
@@ -1135,21 +1134,13 @@ class EntityLinker(Pipe):
             docs = [docs]
             golds = [golds]
 
-        # article_docs = list()
-        sentence_docs = list()
+        context_docs = list()
         entity_encodings = list()
 
         for doc, gold in zip(docs, golds):
             for entity in gold.links:
                 start, end, gold_kb = entity
                 mention = doc.text[start:end]
-                sent_start = 0
-                sent_end = len(doc)
-                for index, sent in enumerate(doc.sents):
-                    if start >= sent.start_char and end <= sent.end_char:
-                        sent_start = sent.start
-                        sent_end = sent.end
-                sentence = doc[sent_start:sent_end].as_doc()
 
                 candidates = self.kb.get_candidates(mention)
                 for c in candidates:
@@ -1159,14 +1150,14 @@ class EntityLinker(Pipe):
                         prior_prob = c.prior_prob
                         entity_encoding = c.entity_vector
                         entity_encodings.append(entity_encoding)
-                        sentence_docs.append(sentence)
+                        context_docs.append(doc)
 
         if len(entity_encodings) > 0:
-            sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop)
+            context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
             entity_encodings = np.asarray(entity_encodings, dtype=np.float32)
 
-            loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None)
-            bp_sent(d_scores, sgd=sgd)
+            loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None)
+            bp_context(d_scores, sgd=sgd)
 
             if losses is not None:
                 losses[self.name] += loss
@@ -1222,28 +1213,25 @@ class EntityLinker(Pipe):
 
         for i, doc in enumerate(docs):
             if len(doc) > 0:
+                context_encoding = self.model([doc])
+                context_enc_t = np.transpose(context_encoding)
                 for ent in doc.ents:
-                    sent_doc = ent.sent.as_doc()
-                    if len(sent_doc) > 0:
-                        sent_encoding = self.model([sent_doc])
-                        sent_enc_t = np.transpose(sent_encoding)
+                    candidates = self.kb.get_candidates(ent.text)
+                    if candidates:
+                        scores = list()
+                        for c in candidates:
+                            prior_prob = c.prior_prob * self.prior_weight
+                            kb_id = c.entity_
+                            entity_encoding = c.entity_vector
+                            sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight
+                            score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
+                            scores.append(score)
 
-                        candidates = self.kb.get_candidates(ent.text)
-                        if candidates:
-                            scores = list()
-                            for c in candidates:
-                                prior_prob = c.prior_prob * self.prior_weight
-                                kb_id = c.entity_
-                                entity_encoding = c.entity_vector
-                                sim = float(cosine(np.asarray([entity_encoding]), sent_enc_t)) * self.context_weight
-                                score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
-                                scores.append(score)
-
-                            # TODO: thresholding
-                            best_index = scores.index(max(scores))
-                            best_candidate = candidates[best_index]
-                            final_entities.append(ent)
-                            final_kb_ids.append(best_candidate.entity_)
+                        # TODO: thresholding
+                        best_index = scores.index(max(scores))
+                        best_candidate = candidates[best_index]
+                        final_entities.append(ent)
+                        final_kb_ids.append(best_candidate.entity_)
 
         return final_entities, final_kb_ids