From 478305cd3f16cbfad2ea6cb9ccf49f434c3395aa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 18:38:09 +0200 Subject: [PATCH] small tweaks and documentation --- bin/wiki_entity_linking/train_descriptions.py | 5 ++ .../training_set_creator.py | 9 ++- bin/wiki_entity_linking/wikidata_processor.py | 2 +- .../wikipedia_processor.py | 3 +- examples/pipeline/wikidata_entity_linking.py | 14 +++-- spacy/language.py | 2 +- spacy/pipeline/pipes.pyx | 60 ++++++++----------- 7 files changed, 49 insertions(+), 46 deletions(-) diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index cc5016237..82db582dc 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -12,6 +12,10 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: + """ + Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). + This entity vector will be stored in the KB, and context vectors will be trained to be similar to them. + """ DROP = 0 EPOCHS = 5 @@ -102,6 +106,7 @@ class EntityEncoder: def _build_network(self, orig_width, hidden_with): with Model.define_operators({">>": chain}): + # very simple encoder-decoder model self.encoder = ( Affine(hidden_with, orig_width) ) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index a0d130824..90df5d9fc 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -10,7 +10,8 @@ from spacy.gold import GoldParse from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp """ -Process Wikipedia interlinks to generate a training dataset for the EL algorithm +Process Wikipedia interlinks to generate a training dataset for the EL algorithm. +Gold-standard entities are stored in one file in standoff format (by character offset). """ # ENTITY_FILE = "gold_entities.csv" @@ -321,12 +322,16 @@ def read_training(nlp, training_dir, dev, limit): current_article_id = article_id ents_by_offset = dict() for ent in current_doc.ents: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + sent_length = len(ent.sent) + # custom filtering to avoid too long or too short sentences + if 5 < sent_length < 100: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: skip_articles.add(current_article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) + skip_articles.add(current_article_id) # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py index 899c607cc..85d3d8488 100644 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -10,7 +10,7 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js def read_wikidata_entities_json(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """ + # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. lang = 'en' site_filter = 'enwiki' diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 0747c9db7..d957fc58c 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -8,6 +8,7 @@ import datetime """ Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. +Write these results to file for downstream KB and training data generation. """ @@ -142,7 +143,7 @@ def _capitalize_first(text): def write_entity_counts(prior_prob_input, count_output, to_print=False): - """ Write entity counts for quick access later """ + # Write entity counts for quick access later entity_to_count = dict() total_count = 0 diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index d537cce7e..c282c7262 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -195,10 +195,11 @@ def run_pipeline(): print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() - acc_r, acc_r_by_label, acc_p, acc_p_by_label, acc_o, acc_o_by_label = _measure_baselines(dev_data, kb_2) - print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_by_label.items()]) - print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_by_label.items()]) - print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_by_label.items()]) + counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2) + print("dev counts:", sorted(counts)) + print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()]) + print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) + print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) with el_pipe.model.use_params(optimizer.averages): # measuring combined accuracy (prior + context) @@ -288,6 +289,8 @@ def _measure_accuracy(data, el_pipe): def _measure_baselines(data, kb): # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound + counts_by_label = dict() + random_correct_by_label = dict() random_incorrect_by_label = dict() @@ -315,6 +318,7 @@ def _measure_baselines(data, kb): # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: + counts_by_label[ent_label] = counts_by_label.get(ent_label, 0) + 1 candidates = kb.get_candidates(ent.text) oracle_candidate = "" best_candidate = "" @@ -353,7 +357,7 @@ def _measure_baselines(data, kb): acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) - return acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + return counts_by_label, acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label def calculate_acc(correct_by_label, incorrect_by_label): diff --git a/spacy/language.py b/spacy/language.py index 0e5e29244..2225a763e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,7 +11,7 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly -from spacy.kb import KnowledgeBase +from .kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7d90c4438..99c361964 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,7 +14,6 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module -from spacy.kb import KnowledgeBase from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -1081,9 +1080,9 @@ class EntityLinker(Pipe): hidden_width = cfg.get("hidden_width", 128) # no default because this needs to correspond with the KB entity length - sent_width = cfg.get("entity_width") + entity_width = cfg.get("entity_width") - model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) + model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) return model @@ -1135,21 +1134,13 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - # article_docs = list() - sentence_docs = list() + context_docs = list() entity_encodings = list() for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] - sent_start = 0 - sent_end = len(doc) - for index, sent in enumerate(doc.sents): - if start >= sent.start_char and end <= sent.end_char: - sent_start = sent.start - sent_end = sent.end - sentence = doc[sent_start:sent_end].as_doc() candidates = self.kb.get_candidates(mention) for c in candidates: @@ -1159,14 +1150,14 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - sentence_docs.append(sentence) + context_docs.append(doc) if len(entity_encodings) > 0: - sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) + context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) - loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) - bp_sent(d_scores, sgd=sgd) + loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) + bp_context(d_scores, sgd=sgd) if losses is not None: losses[self.name] += loss @@ -1222,28 +1213,25 @@ class EntityLinker(Pipe): for i, doc in enumerate(docs): if len(doc) > 0: + context_encoding = self.model([doc]) + context_enc_t = np.transpose(context_encoding) for ent in doc.ents: - sent_doc = ent.sent.as_doc() - if len(sent_doc) > 0: - sent_encoding = self.model([sent_doc]) - sent_enc_t = np.transpose(sent_encoding) + candidates = self.kb.get_candidates(ent.text) + if candidates: + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - candidates = self.kb.get_candidates(ent.text) - if candidates: - scores = list() - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), sent_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? - scores.append(score) - - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate.entity_) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids