diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py
index b3f42dcc4..06ac8d1d4 100644
--- a/examples/pipeline/wiki_entity_linking/train_el.py
+++ b/examples/pipeline/wiki_entity_linking/train_el.py
@@ -5,12 +5,14 @@ import os
 import datetime
 from os import listdir
 from random import shuffle
+import numpy as np
 
 from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator
 
 from spacy._ml import SpacyVectors, create_default_optimizer, zero_init
 
 from thinc.api import chain, flatten_add_lengths, with_getitem, clone
+from thinc.neural.util import get_array_module
 from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu
 from thinc.t2v import Pooling, sum_pool, mean_pool
 from thinc.t2t import ParametricAttention
@@ -23,6 +25,11 @@ from spacy.tokens import Doc
 
 
 class EL_Model():
+    INPUT_DIM = 300
+    OUTPUT_DIM = 5  # 96
+    PRINT_LOSS = True
+    PRINT_F = True
+
     labels = ["MATCH", "NOMATCH"]
     name = "entity_linker"
 
@@ -31,8 +38,8 @@ class EL_Model():
         self.nlp = nlp
         self.kb = kb
 
-        self.entity_encoder = self._simple_encoder(in_width=300, out_width=96)
-        self.article_encoder = self._simple_encoder(in_width=300, out_width=96)
+        self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM)
+        self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM)
 
     def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True):
         Doc.set_extension("entity_id", default=None)
@@ -64,17 +71,20 @@ class EL_Model():
         instance_count = 0
 
         for article_id, inst_cluster_set in train_instances.items():
+            print("article", article_id)
             article_doc = train_doc[article_id]
             pos_ex_list = list()
             neg_exs_list = list()
             for inst_cluster in inst_cluster_set:
+                print("inst_cluster", inst_cluster)
                 instance_count += 1
                 pos_ex_list.append(train_pos.get(inst_cluster))
                 neg_exs_list.append(train_neg.get(inst_cluster, []))
 
             self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses)
             p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc)
-            print(round(fscore, 1))
+            if self.PRINT_F:
+                print(round(fscore, 1))
 
         if to_print:
             print("Trained on", instance_count, "instance clusters")
@@ -102,7 +112,7 @@ class EL_Model():
 
                 examples.append(pos_ex)
                 shuffle(examples)
-                best_entity, lowest_mse = self._predict(examples, article_doc)
+                best_entity, highest_prob = self._predict(examples, article_doc)
                 predictions.append(ex_to_id[best_entity])
                 golds.append(ex_to_id[pos_ex])
 
@@ -113,17 +123,21 @@ class EL_Model():
 
     def _predict(self, entities, article_doc):
         doc_encoding = self.article_encoder([article_doc])
-        lowest_mse = None
+        highest_prob = None
         best_entity = None
 
+        entity_to_vector = dict()
         for entity in entities:
-            entity_encoding = self.entity_encoder([entity])
-            mse, _ = self._calculate_similarity(doc_encoding, entity_encoding)
-            if not best_entity or mse < lowest_mse:
-                lowest_mse = mse
+            entity_to_vector[entity] = self.entity_encoder([entity])
+
+        for entity in entities:
+            entity_encoding = entity_to_vector[entity]
+            prob = self._calculate_probability(doc_encoding, entity_encoding, entity_to_vector.values())
+            if not best_entity or prob > highest_prob:
+                highest_prob = prob
                 best_entity = entity
 
-        return best_entity, lowest_mse
+        return best_entity, highest_prob
 
     def _simple_encoder(self, in_width, out_width):
         conv_depth = 1
@@ -164,103 +178,56 @@ class EL_Model():
         return sgd
 
     def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None):
 
-        # TODO: one call only to begin_update ?
-        entity_diffs = None
-        doc_diffs = None
-
-        doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop)
 
         for i, true_entity in enumerate(true_entity_list):
-            false_entities = false_entities_list[i]
+            for cnt in range(10):
+                #try:
+                    false_vectors = list()
+                    false_entities = false_entities_list[i]
+                    if len(false_entities) > 0:
+                        # TODO: batch per doc
+                        doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop)
+                        doc_encoding = doc_encoding[0]
+                        print()
+                        print(cnt)
+                        print("doc", doc_encoding)
 
-            true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop)
-            # print("encoding dim", len(true_entity_encoding[0]))
+                        for false_entity in false_entities:
+                            # TODO: one call only to begin_update ?
+                            false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop)
+                            false_entity_encoding = false_entity_encoding[0]
+                            false_vectors.append(false_entity_encoding)
 
-            consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding)
+                        true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop)
+                        true_entity_encoding = true_entity_encoding[0]
 
-            doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding)
+                        all_vectors = [true_entity_encoding]
+                        all_vectors.extend(false_vectors)
 
-            entity_mses = list()
+                        # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding)
 
-            true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding)
-            # print("true_mse", true_mse)
-            # print("true_diffs", true_diffs)
-            entity_mses.append(true_mse)
-            # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t))
-            # print("true_exp", true_exp)
+                        true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors)
+                        print("true", true_prob, true_entity_encoding)
 
-            # false_exp_sum = 0
+                        all_probs = [true_prob]
+                        for false_vector in false_vectors:
+                            false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors)
+                            print("false", false_prob, false_vector)
+                            all_probs.append(false_prob)
 
-            if doc_diffs is not None:
-                doc_diffs += doc_diff
-                entity_diffs += true_diffs
-            else:
-                doc_diffs = doc_diff
-                entity_diffs = true_diffs
+                        loss = self._calculate_loss(true_prob, all_probs).astype(np.float32)
+                        if self.PRINT_LOSS:
+                            print("loss", round(loss, 5))
 
-            for false_entity in false_entities:
-                false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop)
-                false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding)
-                # print("false_mse", false_mse)
-                # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t))
-                # print("false_exp", false_exp)
-                # print("false_diffs", false_diffs)
-                entity_mses.append(false_mse)
-                # if false_mse > true_mse:
-                    # true_diffs = true_diffs - false_diffs ???
-                # false_exp_sum += false_exp
-
-            # prob = true_exp / false_exp_sum
-            # print("prob", prob)
-
-            entity_mses = sorted(entity_mses)
-            # mse_sum = sum(entity_mses)
-            # entity_probs = [1 - x/mse_sum for x in entity_mses]
-            # print("entity_mses", entity_mses)
-            # print("entity_probs", entity_probs)
-            true_index = entity_mses.index(true_mse)
-            # print("true index", true_index)
-            # print("true prob", entity_probs[true_index])
-
-            # print("training loss", true_mse)
-
-            # print()
-
-        # TODO: proper backpropagation taking ranking of elements into account ?
-        # TODO backpropagation also for negative examples
-
-        if doc_diffs is not None:
-            doc_diffs = doc_diffs / len(true_entity_list)
-
-            true_entity_bp(entity_diffs, sgd=self.sgd_entity)
-            article_bp(doc_diffs, sgd=self.sgd_article)
+                        doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors)
+                        print("doc_gradient", doc_gradient)
+                        article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article)
+                #except Exception as e:
+                    #pass
 
 
-    # TODO delete ?
-    def _simple_cnn_model(self, internal_dim):
-        nr_class = len(self.labels)
-        with Model.define_operators({">>": chain}):
-            model_entity = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool)  # entity encoding
-            model_doc = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool)  # doc encoding
-            output_layer = Softmax(nr_class, internal_dim*2)
-            model = (model_entity | model_doc) >> output_layer
-        # model.tok2vec = chain(tok2vec, flatten)
-        model.nO = nr_class
-        return model
-
-    def predict(self, entity_doc, article_doc):
-        entity_encoding = self.entity_encoder(entity_doc)
-        doc_encoding = self.article_encoder(article_doc)
-
-        print("entity_encodings", len(entity_encoding), entity_encoding)
-        print("doc_encodings", len(doc_encoding), doc_encoding)
-        mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding)
-        print("mse", mse)
-
-        return mse
-
-    # TODO: expand to more than 2 vectors
+    # TODO: FIX
     def _calculate_consensus(self, vector1, vector2):
         if len(vector1) != len(vector2):
             raise ValueError("To calculate consenus, both vectors should be of equal length")
@@ -268,17 +235,51 @@ class EL_Model():
         avg = (vector2 + vector1) / 2
         return avg
 
-    def _calculate_similarity(self, vector1, vector2):
+    def _calculate_probability(self, vector1, vector2, allvectors):
+        """ Make sure that vector2 is included in allvectors """
         if len(vector1) != len(vector2):
             raise ValueError("To calculate similarity, both vectors should be of equal length")
 
-        diffs = (vector1 - vector2)
-        error_sum = (diffs ** 2).sum()
-        mean_square_error = error_sum / len(vector1)
-        return float(mean_square_error), diffs
+        vector1_t = vector1.transpose()
+        e = self._calculate_dot_exp(vector2, vector1_t)
+        e_sum = 0
+        for v in allvectors:
+            e_sum += self._calculate_dot_exp(v, vector1_t)
 
-    def _get_labels(self):
-        return tuple(self.labels)
+        return float(e / e_sum)
+
+    @staticmethod
+    def _calculate_loss(true_prob, all_probs):
+        """ all_probs should include true_prob ! """
+        return -1 * np.log(true_prob / sum(all_probs))
+
+    @staticmethod
+    def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors):
+        gradient = np.zeros(len(doc_vector))
+        for i in range(len(doc_vector)):
+            min_false = min(x[i] for x in false_vectors)
+            max_false = max(x[i] for x in false_vectors)
+
+            if true_vector[i] > max_false:
+                if doc_vector[i] > 0:
+                    gradient[i] = 0
+                else:
+                    gradient[i] = -loss
+            elif true_vector[i] < min_false:
+                if doc_vector[i] > 0:
+                    gradient[i] = loss
+                if doc_vector[i] < 0:
+                    gradient[i] = 0
+            else:
+                target = 0  # non-distinctive vector positions should convert to 0
+                gradient[i] = doc_vector[i] - target
+
+        return gradient
+
+    @staticmethod
+    def _calculate_dot_exp(vector1, vector2_transposed):
+        e = np.exp(vector1.dot(vector2_transposed))
+        return e
 
     def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print):
         id_to_descr = kb_creator._get_id_to_description(entity_descr_output)
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index 43cc41392..bc75ac09a 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -111,7 +111,7 @@ if __name__ == "__main__":
     print("STEP 6: training ", datetime.datetime.now())
     my_nlp = spacy.load('en_core_web_md')
     trainer = EL_Model(kb=my_kb, nlp=my_nlp)
-    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=50)
+    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=5)
    print()

    # STEP 7: apply the EL algorithm on the dev dataset