From 9f33732b96310dc482097e1a6661415a08acc57a Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 7 May 2019 16:03:42 +0200
Subject: [PATCH] using entity descriptions and article texts as input
 embedding vectors for training

---
 .../wiki_entity_linking/kb_creator.py         | 50 +++++++++--
 .../pipeline/wiki_entity_linking/run_el.py    |  4 +-
 .../pipeline/wiki_entity_linking/train_el.py  | 58 ++++++++++++
 .../training_set_creator.py                   | 19 +---
 .../wiki_entity_linking/wiki_nel_pipeline.py  | 34 ++++---
 .../wiki_entity_linking/wikidata_processor.py | 90 ++++---------------
 6 files changed, 147 insertions(+), 108 deletions(-)
 create mode 100644 examples/pipeline/wiki_entity_linking/train_el.py

diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index b9e663bb9..bb00f918d 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -4,13 +4,16 @@ from __future__ import unicode_literals
 import spacy
 from spacy.kb import KnowledgeBase
 
+import csv
 import datetime
 
 from . import wikipedia_processor as wp
 from . import wikidata_processor as wd
 
 
-def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input,
+def create_kb(vocab, max_entities_per_alias, min_occ,
+              entity_def_output, entity_descr_output,
+              count_input, prior_prob_input,
               to_print=False, write_entity_defs=True):
     """ Create the knowledge base from Wikidata entries """
     kb = KnowledgeBase(vocab=vocab)
@@ -18,15 +21,11 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input
     print()
     print("1. _read_wikidata_entities", datetime.datetime.now())
     print()
-    # title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
-    title_to_id = wd.read_wikidata_entities_json(limit=None)
+    title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None)
 
-    # write the title-ID mapping to file
+    # write the title-ID and ID-description mappings to file
    if write_entity_defs:
-        with open(entity_output, mode='w', encoding='utf8') as entity_file:
-            entity_file.write("WP_title" + "|" + "WD_id" + "\n")
-            for title, qid in title_to_id.items():
-                entity_file.write(title + "|" + str(qid) + "\n")
+        _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr)
 
     title_list = list(title_to_id.keys())
     entity_list = [title_to_id[x] for x in title_list]
@@ -57,6 +56,41 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input
     return kb
 
 
+def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr):
+    with open(entity_def_output, mode='w', encoding='utf8') as id_file:
+        id_file.write("WP_title" + "|" + "WD_id" + "\n")
+        for title, qid in title_to_id.items():
+            id_file.write(title + "|" + str(qid) + "\n")
+    with open(entity_descr_output, mode='w', encoding='utf8') as descr_file:
+        descr_file.write("WD_id" + "|" + "description" + "\n")
+        for qid, descr in id_to_descr.items():
+            descr_file.write(str(qid) + "|" + descr + "\n")
+
+
+def _get_entity_to_id(entity_def_output):
+    entity_to_id = dict()
+    with open(entity_def_output, 'r', encoding='utf8') as csvfile:
+        csvreader = csv.reader(csvfile, delimiter='|')
+        # skip header
+        next(csvreader)
+        for row in csvreader:
+            entity_to_id[row[0]] = row[1]
+
+    return entity_to_id
+
+
+def _get_id_to_description(entity_descr_output):
+    id_to_desc = dict()
+    with open(entity_descr_output, 'r', encoding='utf8') as csvfile:
+        csvreader = csv.reader(csvfile, delimiter='|')
+        # skip header
+        next(csvreader)
+        for row in csvreader:
+            id_to_desc[row[0]] = row[1]
+
+    return id_to_desc
+
+
 def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False):
     wp_titles = title_to_id.keys()
 
diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py
index c2156e31b..96fe58740 100644
--- a/examples/pipeline/wiki_entity_linking/run_el.py
+++ b/examples/pipeline/wiki_entity_linking/run_el.py
@@ -32,7 +32,7 @@ def run_el_toy_example(nlp, kb):
         print("ent", ent.text, ent.label_, ent.kb_id_)
 
 
-def run_el_training(nlp, kb, training_dir, limit=None):
+def run_el_dev(nlp, kb, training_dir, limit=None):
     _prepare_pipeline(nlp, kb)
 
     correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir,
@@ -48,7 +48,7 @@ def run_el_training(nlp, kb, training_dir, limit=None):
         if is_dev(f):
             article_id = f.replace(".txt", "")
             if cnt % 500 == 0:
-                print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
+                print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset")
             cnt += 1
             with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                 text = file.read()
diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py
new file mode 100644
index 000000000..b3ebb658f
--- /dev/null
+++ b/examples/pipeline/wiki_entity_linking/train_el.py
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os
+import datetime
+from os import listdir
+
+from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator
+from examples.pipeline.wiki_entity_linking import wikidata_processor as wd
+
+""" TODO: this code needs to be implemented in pipes.pyx"""
+
+
+def train_model(kb, nlp, training_dir, entity_descr_output, limit=None):
+    run_el._prepare_pipeline(nlp, kb)
+
+    correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir,
+                                                                                     collect_correct=True,
+                                                                                     collect_incorrect=True)
+
+    entities = kb.get_entity_strings()
+
+    id_to_descr = kb_creator._get_id_to_description(entity_descr_output)
+
+    cnt = 0
+    for f in listdir(training_dir):
+        if not limit or cnt < limit:
+            if not run_el.is_dev(f):
+                article_id = f.replace(".txt", "")
+                if cnt % 500 == 0:
+                    print(datetime.datetime.now(), "processed", cnt, "files in the training dataset")
+                cnt += 1
+                with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
+                    text = file.read()
+                    print()
+                    doc = nlp(text)
+                    doc_vector = doc.vector
+                    print("FILE", f, len(doc_vector), "D vector")
+
+                    for mention_pos, entity_pos in correct_entries[article_id].items():
+                        descr = id_to_descr.get(entity_pos)
+                        if descr:
+                            doc_descr = nlp(descr)
+                            descr_vector = doc_descr.vector
+                            print("GOLD POS", mention_pos, entity_pos, len(descr_vector), "D vector")
+
+                    for mention_neg, entity_negs in incorrect_entries[article_id].items():
+                        for entity_neg in entity_negs:
+                            descr = id_to_descr.get(entity_neg)
+                            if descr:
+                                doc_descr = nlp(descr)
+                                descr_vector = doc_descr.vector
+                                print("GOLD NEG", mention_neg, entity_neg, len(descr_vector), "D vector")
+
+    print()
+    print("Processed", cnt, "training articles")
+    print()
+
diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py
index 47349d3dc..b1c63c55c 100644
--- a/examples/pipeline/wiki_entity_linking/training_set_creator.py
+++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py
@@ -6,7 +6,7 @@ import csv
 import bz2
 import datetime
 
-from . import wikipedia_processor as wp
+from . import wikipedia_processor as wp, kb_creator
 
 """
 Process Wikipedia interlinks to generate a training dataset for the EL algorithm
@@ -14,26 +14,15 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm
 
 ENTITY_FILE = "gold_entities.csv"
 
-def create_training(kb, entity_input, training_output):
+
+def create_training(kb, entity_def_input, training_output):
     if not kb:
         raise ValueError("kb should be defined")
     # nlp = spacy.load('en_core_web_sm')
-    wp_to_id = _get_entity_to_id(entity_input)
+    wp_to_id = kb_creator._get_entity_to_id(entity_def_input)
     _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000)  # TODO: full dataset
 
 
-def _get_entity_to_id(entity_input):
-    entity_to_id = dict()
-    with open(entity_input, 'r', encoding='utf8') as csvfile:
-        csvreader = csv.reader(csvfile, delimiter='|')
-        # skip header
-        next(csvreader)
-        for row in csvreader:
-            entity_to_id[row[0]] = row[1]
-
-    return entity_to_id
-
-
 def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
     """
     Read the XML wikipedia data to parse out training data:
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index ebc1e7958..26e2a7ae2 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el
+from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el, train_el
 
 import spacy
 from spacy.vocab import Vocab
@@ -15,11 +15,12 @@ Demonstrate how to build a knowledge base from WikiData and run an Entity Linkin
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
 ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
+ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv'
 
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
 
-TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
+TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/'
 
 
 if __name__ == "__main__":
@@ -30,17 +31,20 @@ if __name__ == "__main__":
     # one-time methods to create KB and write to file
     to_create_prior_probs = False
     to_create_entity_counts = False
-    to_create_kb = False
+    to_create_kb = True
 
     # read KB back in from file
     to_read_kb = True
-    to_test_kb = False
+    to_test_kb = True
 
     # create training dataset
     create_wp_training = False
 
-    # apply named entity linking to the training dataset
-    apply_to_training = True
+    # run training
+    run_training = False
+
+    # apply named entity linking to the dev dataset
+    apply_to_dev = False
 
     # STEP 1 : create prior probabilities from WP
     # run only once !
@@ -65,7 +69,8 @@ if __name__ == "__main__":
         my_kb = kb_creator.create_kb(my_vocab,
                                      max_entities_per_alias=10,
                                      min_occ=5,
-                                     entity_output=ENTITY_DEFS,
+                                     entity_def_output=ENTITY_DEFS,
+                                     entity_descr_output=ENTITY_DESCR,
                                      count_input=ENTITY_COUNTS,
                                      prior_prob_input=PRIOR_PROB,
                                      to_print=False)
@@ -98,12 +103,19 @@ if __name__ == "__main__":
 
     # STEP 5: create a training dataset from WP
     if create_wp_training:
         print("STEP 5: create training dataset", datetime.datetime.now())
-        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR)
+        training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR)
 
-    # STEP 6: apply the EL algorithm on the training dataset
-    if apply_to_training:
+    # STEP 6: run the EL training on the training dataset
+    if run_training:
+        print("STEP 6: training", datetime.datetime.now())
         my_nlp = spacy.load('en_core_web_sm')
-        run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000)
+        train_el.train_model(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=5)
+        print()
+
+    # STEP 7: apply the EL algorithm on the dev dataset
+    if apply_to_dev:
+        my_nlp = spacy.load('en_core_web_sm')
+        run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000)
         print()
 
diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py
index 03db05414..7d84b1a2a 100644
--- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py
+++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py
@@ -13,17 +13,18 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js
 
 def read_wikidata_entities_json(limit=None, to_print=False):
     """ Read the JSON wiki data and parse out the entities. Takes about 7.5 hours to parse 55M lines. """
""" - languages = {'en', 'de'} + lang = 'en' prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' title_to_id = dict() + id_to_descr = dict() # parse appropriate fields - depending on what we need in the KB parse_properties = False parse_sitelinks = True parse_labels = False - parse_descriptions = False + parse_descriptions = True parse_aliases = False with bz2.open(WIKIDATA_JSON, mode='rb') as file: @@ -76,91 +77,36 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id - # print(site, "for", unique_id) if parse_labels: labels = obj["labels"] if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) if parse_descriptions: descriptions = obj["descriptions"] if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + "):", lang_descr["value"]) + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + id_to_descr[unique_id] = lang_descr["value"] if parse_aliases: aliases = obj["aliases"] if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) if to_print: print() line = file.readline() cnt += 1 - return title_to_id - - -def _read_wikidata_entities_regex_depr(limit=None): - """ - Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. - TODO: doesn't work yet. may be deleted ? - """ - - regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) - regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) - regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - - title_to_id = dict() - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - clean_line = line.strip().decode("utf-8") - keep = False - - p31_matches = regex_p31.findall(clean_line) - if p31_matches: - for p31_match in p31_matches: - id_matches = regex_id.findall(p31_match) - for id_match in id_matches: - id_match = id_match[6:][:-1] - if id_match == "Q5" or id_match == "Q15632617": - keep = True - - if keep: - id_match = regex_id.search(clean_line).group(0) - id_match = id_match[6:][:-1] - - enwiki_matches = regex_enwiki.findall(clean_line) - if enwiki_matches: - for enwiki_match in enwiki_matches: - title_match = regex_title.search(enwiki_match).group(0) - title = title_match[9:][:-1] - title_to_id[title] = id_match - - line = file.readline() - cnt += 1 - - return title_to_id + return title_to_id, id_to_descr