diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index b66f8b316..ded4bdc24 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -126,7 +126,7 @@ if __name__ == "__main__":
                                                         id_to_descr=id_to_descr,
                                                         doc_cutoff=DOC_CHAR_CUTOFF,
                                                         dev=False,
-                                                        limit=10,
+                                                        limit=100,
                                                         to_print=False)
 
         el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
@@ -137,6 +137,8 @@ if __name__ == "__main__":
         nlp.begin_training()
 
         for itn in range(EPOCHS):
+            print()
+            print("EPOCH", itn)
             random.shuffle(train_data)
             losses = {}
             batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@@ -150,15 +152,6 @@ if __name__ == "__main__":
                 )
             print("Losses", losses)
 
-    ### BELOW CODE IS DEPRECATED ###
-
-    # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx
-    if run_el_training:
-        print("STEP 6: training", datetime.datetime.now())
-        trainer = EL_Model(kb=my_kb, nlp=nlp)
-        trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500)
-        print()
-
     # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?)
     if apply_to_dev:
         run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000)
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index f15ffd036..01302b618 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1125,51 +1125,59 @@ class EntityLinker(Pipe):
             docs = [docs]
             golds = [golds]
 
+        article_docs = list()
+        sentence_docs = list()
+        entity_encodings = list()
+
         for doc, gold in zip(docs, golds):
-            print("doc", doc)
             for entity in gold.links:
                 start, end, gold_kb = entity
-                print("entity", entity)
-                mention = doc[start:end].text
-                print("mention", mention)
-                candidates = self.kb.get_candidates(mention)
+                mention = doc[start:end]
+                sentence = mention.sent
+
+                candidates = self.kb.get_candidates(mention.text)
                 for c in candidates:
-                    prior_prob = c.prior_prob
                     kb_id = c.entity_
-                    print("candidate", kb_id, prior_prob)
-                    entity_encoding = c.entity_vector
-                    print()
+                    # TODO: currently only training on the positive instances
+                    if kb_id == gold_kb:
+                        prior_prob = c.prior_prob
+                        entity_encoding = c.entity_vector
 
-            print()
+                        entity_encodings.append(entity_encoding)
+                        article_docs.append(doc)
+                        sentence_docs.append(sentence.as_doc())
 
-        # entity_encodings = None #TODO
-        # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
-        # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
-        #
-        # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in
-        #                     range(len(article_docs))]
-        # mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP)
-        #
-        # loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
-        #
-        # mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont)
-        #
-        # # gradient : concat (doc+sent) vs. desc
-        # sent_start = self.article_encoder.nO
-        # sent_gradients = list()
-        # doc_gradients = list()
-        # for x in mention_gradient:
-        #     doc_gradients.append(list(x[0:sent_start]))
-        #     sent_gradients.append(list(x[sent_start:]))
-        #
-        # bp_doc(doc_gradients, sgd=self.sgd_article)
-        # bp_sent(sent_gradients, sgd=self.sgd_sent)
-        #
-        # if losses is not None:
-        #     losses.setdefault(self.name, 0.0)
-        #     losses[self.name] += loss
-        # return loss
-        return None
+        if len(entity_encodings) > 0:
+            doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
+            sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
+
+            concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in
+                                range(len(article_docs))]
+            mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop)
+
+            entity_encodings = np.asarray(entity_encodings, dtype=np.float32)
+
+            loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+
+            mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention)
+
+            # gradient : concat (doc+sent) vs. desc
+            sent_start = self.article_encoder.nO
+            sent_gradients = list()
+            doc_gradients = list()
+            for x in mention_gradient:
+                doc_gradients.append(list(x[0:sent_start]))
+                sent_gradients.append(list(x[sent_start:]))
+
+            bp_doc(doc_gradients, sgd=self.sgd_article)
+            bp_sent(sent_gradients, sgd=self.sgd_sent)
+
+            if losses is not None:
+                losses.setdefault(self.name, 0.0)
+                losses[self.name] += loss
+            return loss
+
+        return 0
 
     def get_loss(self, docs, golds, scores):
         loss, gradients = get_cossim_loss(scores, golds)
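
Note on the loss: get_loss above delegates to get_cossim_loss, which lives elsewhere in pipes.pyx and is not part of this diff. Purely as an illustration (not the actual spaCy helper, whose implementation may differ), a cosine-similarity loss over mention encodings and gold entity vectors could look like the sketch below, returning both the summed loss and the gradient with respect to the predicted scores:

import numpy as np

def cossim_loss(scores, golds):
    # scores, golds: (n_mentions, width) float arrays.
    # Loss is the summed (1 - cosine similarity) per row; the gradient is
    # taken w.r.t. `scores` so it can flow back into the mention encoder.
    norm_s = np.linalg.norm(scores, axis=1, keepdims=True)
    norm_g = np.linalg.norm(golds, axis=1, keepdims=True)
    cosine = (scores * golds).sum(axis=1, keepdims=True) / (norm_s * norm_g)
    loss = (1.0 - cosine).sum()
    # derivative of (1 - cosine(scores, golds)) with respect to scores
    d_scores = -(golds / (norm_s * norm_g) - cosine * scores / (norm_s ** 2))
    return loss, d_scores

The d_scores value here plays the role of the gradient that update feeds into bp_mention before splitting it into the document and sentence portions.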