From 653b7d9c87e62c8e37c96f0bac76f5c18ca4889c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Tue, 30 Apr 2019 11:39:42 +0200
Subject: [PATCH] calculate entity raw counts offline to speed up KB construction

---
 examples/pipeline/wikidata_entity_linking.py | 91 ++++++++++++++------
 1 file changed, 64 insertions(+), 27 deletions(-)

diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index 2a544674f..43ba7d8d3 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -1,23 +1,25 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from spacy.vocab import Vocab
-
 """ Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """
 
 import re
+import csv
 import json
 import spacy
 import datetime
 import bz2
 
 from spacy.kb import KnowledgeBase
+from spacy.vocab import Vocab
 
 # TODO: remove hardcoded paths
 WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
 ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
 ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
+
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
+ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
 
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
@@ -44,18 +46,30 @@ map_alias_to_link = dict()
 
 def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     kb = KnowledgeBase(vocab=vocab)
 
-    id_to_title = _read_wikidata_entities(limit=None)
-    title_to_id = {v: k for k, v in id_to_title.items()}
+    print()
+    print("1. _read_wikidata_entities", datetime.datetime.now())
+    print()
+    title_to_id = _read_wikidata_entities(limit=100000)
 
-    entity_list = list(id_to_title.keys())
-    title_list = [id_to_title[x] for x in entity_list]
-    entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False)
+    title_list = list(title_to_id.keys())
+    entity_list = [title_to_id[x] for x in title_list]
+    print()
+    print("2. _get_entity_frequencies", datetime.datetime.now())
+    print()
+    entity_frequencies = _get_entity_frequencies(entities=title_list)
+
+    print()
+    print("3. _add_entities", datetime.datetime.now())
+    print()
     _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print)
 
+    print()
+    print("4. _add_aliases", datetime.datetime.now())
_add_aliases", datetime.datetime.now()) + print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, @@ -72,15 +86,26 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): return kb -def _get_entity_frequencies(entities, to_print=False): - count_entities = [0 for _ in entities] +def _get_entity_frequencies(entities): + entity_to_count = dict() + with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_count[row[0]] = int(row[1]) + + return [entity_to_count.get(e, 0) for e in entities] + + +def _write_entity_counts(to_print=False): + entity_to_count = dict() total_count = 0 with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count while line: splits = line.replace('\n', "").split(sep='|') @@ -88,23 +113,26 @@ def _get_entity_frequencies(entities, to_print=False): count = int(splits[1]) entity = splits[2] - if entity in entities: - index = entities.index(entity) - count_entities[index] = count_entities[index] + count + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count total_count += count line = prior_file.readline() + with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + if to_print: - for entity, count in zip(entities, count_entities): + for entity, count in entity_to_count.items(): print("Entity count:", entity, count) print("Total count:", total_count) - return [x*100 / total_count for x in count_entities] - def _add_entities(kb, entities, probs, to_print=False): + # TODO: this should be a bulk method for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -166,13 +194,13 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals def _read_wikidata_entities(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities""" + """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" languages = {'en', 'de'} prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' - entity_dict = dict() + title_to_id = dict() # parse appropriate fields - depending on what we need in the KB parse_properties = False @@ -192,12 +220,12 @@ def _read_wikidata_entities(limit=None, to_print=False): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) - unique_id = obj["id"] entry_type = obj["type"] - if unique_id[0] == 'Q' and entry_type == "item": + if entry_type == "item": # filtering records on their properties keep = False + claims = obj["claims"] for prop, value_set in prop_filter.items(): claim_property = claims.get(prop, None) @@ -209,6 +237,8 @@ def _read_wikidata_entities(limit=None, to_print=False): keep = True if keep: + unique_id = obj["id"] + if to_print: print("ID:", unique_id) print("type:", entry_type) @@ -225,9 +255,10 @@ def _read_wikidata_entities(limit=None, to_print=False): if parse_sitelinks: site_value = obj["sitelinks"].get(site_filter, None) if site_value: + site = site_value['title'] if to_print: - print(site_filter, ":", site_value['title']) - entity_dict[unique_id] = site_value['title'] + print(site_filter, ":", site) + title_to_id[site] = unique_id if parse_labels: labels = obj["labels"] @@ -262,7 +293,7 @@ def _read_wikidata_entities(limit=None, to_print=False): line = file.readline() cnt += 1 - return entity_dict + return title_to_id def _read_wikipedia_prior_probs(): @@ -469,6 +500,7 @@ def capitalize_first(text): if __name__ == "__main__": to_create_prior_probs = False + to_create_entity_counts = False to_create_kb = True to_read_kb = False @@ -477,20 +509,25 @@ if __name__ == "__main__": if to_create_prior_probs: _read_wikipedia_prior_probs() + # STEP 2 : deduce entity frequencies from WP + # run only once ! + if to_create_entity_counts: + _write_entity_counts() + if to_create_kb: - # STEP 2 : create KB + # STEP 3 : create KB my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) - # STEP 3 : write KB to file + # STEP 4 : write KB to file my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) if to_read_kb: - # STEP 4 : read KB back in from file + # STEP 5 : read KB back in from file my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) @@ -507,5 +544,5 @@ if __name__ == "__main__": print("alias:", c.alias_) print("prior prob:", c.prior_prob) - # STEP 5: add KB to NLP pipeline + # STEP 6: add KB to NLP pipeline # add_el(my_kb, nlp)