diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index e293be90f..e6df39631 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -10,9 +10,13 @@ import json
 import spacy
 import datetime
 import bz2
+
 from spacy.kb import KnowledgeBase
 from spacy.vocab import Vocab
 
+# requires: pip install neuralcoref --no-binary neuralcoref
+# import neuralcoref
+
 # TODO: remove hardcoded paths
 WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
 ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
@@ -20,6 +24,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar
 
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
+ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
@@ -43,7 +48,150 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
 map_alias_to_link = dict()
 
 
-def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
+def read_wikipedia_prior_probs():
+    """
+    STEP 1: Read the XML Wikipedia dump and parse out intra-wiki links to estimate prior probabilities.
+    The full file takes about 2h to parse its 1100M lines (an update is printed every 5M lines).
+    It runs relatively fast because we don't care about which article an intra-wiki link came from;
+    we just process the dump line by line.
+    """
+
+    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
+        line = file.readline()
+        cnt = 0
+        while line:
+            if cnt % 5000000 == 0:
+                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
+            clean_line = line.strip().decode("utf-8")
+
+            aliases, entities, normalizations = _get_wp_links(clean_line)
+            for alias, entity, norm in zip(aliases, entities, normalizations):
+                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
+
+            line = file.readline()
+            cnt += 1
+
+    # write all aliases and their entities and occurrences to file
+    with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
+        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
+        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
+            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
+                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
+
+
+# find the links
+link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
+
+# match on interwiki links, e.g. `en:` or `:fr:`
+ns_regex = r":?" + "[a-z][a-z]" + ":"
+
+# match on Namespace: optionally preceded by a :
+for ns in wiki_namespaces:
+    ns_regex += "|" + ":?" + ns + ":"
+
+ns_regex = re.compile(ns_regex, re.IGNORECASE)
+
+
+def _get_wp_links(text):
+    aliases = []
+    entities = []
+    normalizations = []
+
+    matches = link_regex.findall(text)
+    for match in matches:
+        match = match[2:][:-2].replace("_", " ").strip()
+
+        if ns_regex.match(match):
+            pass  # ignore namespaces at the beginning of the string
+
+        # this is a simple link, with the alias the same as the mention
+        elif "|" not in match:
+            aliases.append(match)
+            entities.append(match)
+            normalizations.append(True)
+
+        # in wiki format, the link is written as [[entity|alias]]
+        else:
+            splits = match.split("|")
+            entity = splits[0].strip()
+            alias = splits[1].strip()
+            # specific wiki format [[alias (specification)|]]
+            if len(alias) == 0 and "(" in entity:
+                alias = entity.split("(")[0]
+                aliases.append(alias)
+                entities.append(entity)
+                normalizations.append(False)
+            else:
+                aliases.append(alias)
+                entities.append(entity)
+                normalizations.append(False)
+
+    return aliases, entities, normalizations
+
+
+def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
+    alias = alias.strip()
+    entity = entity.strip()
+
+    # remove everything after # as this is not part of the title but refers to a specific paragraph
+    if normalize_entity:
+        # wikipedia titles are always capitalized
+        entity = _capitalize_first(entity.split("#")[0])
+    if normalize_alias:
+        alias = alias.split("#")[0]
+
+    if alias and entity:
+        alias_dict = map_alias_to_link.get(alias, dict())
+        entity_count = alias_dict.get(entity, 0)
+        alias_dict[entity] = entity_count + 1
+        map_alias_to_link[alias] = alias_dict
+
+
+def _capitalize_first(text):
+    if not text:
+        return None
+    result = text[0].capitalize()
+    if len(result) > 0:
+        result += text[1:]
+    return result
+
+
+def write_entity_counts(to_print=False):
+    """ STEP 2: write entity counts """
+    entity_to_count = dict()
+    total_count = 0
+
+    with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
+        # skip header
+        prior_file.readline()
+        line = prior_file.readline()
+
+        while line:
+            splits = line.replace('\n', "").split(sep='|')
+            # alias = splits[0]
+            count = int(splits[1])
+            entity = splits[2]
+
+            current_count = entity_to_count.get(entity, 0)
+            entity_to_count[entity] = current_count + count
+
+            total_count += count
+
+            line = prior_file.readline()
+
+    with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
+        entity_file.write("entity" + "|" + "count" + "\n")
+        for entity, count in entity_to_count.items():
+            entity_file.write(entity + "|" + str(count) + "\n")
+
+    if to_print:
+        for entity, count in entity_to_count.items():
+            print("Entity count:", entity, count)
+        print("Total count:", total_count)
+
+
+def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True):
+    """ STEP 3: create the knowledge base """
     kb = KnowledgeBase(vocab=vocab)
 
     print()
@@ -52,6 +200,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     # title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
     title_to_id = _read_wikidata_entities_json(limit=None)
 
+    # write the title-ID mapping to file
+    if write_entity_defs:
+        with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file:
+            entity_file.write("WP_title" + "|" + "WD_id" + "\n")
+            for title, qid in title_to_id.items():
+                entity_file.write(title + "|" + str(qid) + "\n")
+
     title_list = list(title_to_id.keys())
     entity_list = [title_to_id[x] for x in title_list]
 
@@ -94,37 +249,16 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     return [entity_to_count.get(e, 0) for e in entities]
 
 
-def _write_entity_counts(to_print=False):
-    entity_to_count = dict()
-    total_count = 0
-
-    with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
+def _get_entity_to_id():
+    entity_to_id = dict()
+    with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile:
+        csvreader = csv.reader(csvfile, delimiter='|')
         # skip header
-        prior_file.readline()
-        line = prior_file.readline()
+        next(csvreader)
+        for row in csvreader:
+            entity_to_id[row[0]] = row[1]
 
-        while line:
-            splits = line.replace('\n', "").split(sep='|')
-            # alias = splits[0]
-            count = int(splits[1])
-            entity = splits[2]
-
-            current_count = entity_to_count.get(entity, 0)
-            entity_to_count[entity] = current_count + count
-
-            total_count += count
-
-            line = prior_file.readline()
-
-    with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
-        entity_file.write("entity" + "|" + "count" + "\n")
-        for entity, count in entity_to_count.items():
-            entity_file.write(entity + "|" + str(count) + "\n")
-
-    if to_print:
-        for entity, count in entity_to_count.items():
-            print("Entity count:", entity, count)
-        print("Total count:", total_count)
+    return entity_to_id
 
 
 def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
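
For reference, a minimal sketch of how the alias|count|entity rows that read_wikipedia_prior_probs() writes to PRIOR_PROB in STEP 1 can be turned into per-alias prior probabilities. load_prior_probs() is a hypothetical helper, not part of this patch; it assumes the '|' delimiter and header used above. Each normalized value is the kind of prior probability that later shows up as c.prior_prob on the KB candidates printed in test_kb().

import csv
from collections import defaultdict


def load_prior_probs(prior_prob_path):
    # hypothetical helper, not part of the patch: read back the "alias|count|entity" file
    # written by read_wikipedia_prior_probs() and normalize the counts per alias
    alias_to_counts = defaultdict(dict)
    with open(prior_prob_path, mode='r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        next(csvreader)  # skip the "alias|count|entity" header
        for row in csvreader:
            alias, count, entity = row[0], int(row[1]), row[2]
            alias_to_counts[alias][entity] = count

    # p(entity|alias) = count(alias -> entity) / total count of the alias
    alias_to_priors = dict()
    for alias, entity_counts in alias_to_counts.items():
        total = sum(entity_counts.values())
        alias_to_priors[alias] = {entity: count / total
                                  for entity, count in entity_counts.items()}
    return alias_to_priors
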
@@ -337,85 +471,60 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
     return title_to_id
 
 
-def _read_wikipedia_prior_probs():
-    """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
-    The full file takes about 2h to parse 1100M lines (update printed every 5M lines)
-    """
+def test_kb(kb):
+    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
+    nlp = spacy.load('en_core_web_sm')
 
-    # find the links
-    link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
+    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
+    nlp.add_pipe(el_pipe, last=True)
 
-    # match on interwiki links, e.g. `en:` or `:fr:`
-    ns_regex = r":?" + "[a-z][a-z]" + ":"
+    candidates = kb.get_candidates("Bush")
 
-    # match on Namespace: optionally preceded by a :
-    for ns in wiki_namespaces:
-        ns_regex += "|" + ":?" + ns + ":"
+    print("generating candidates for 'Bush' :")
+    for c in candidates:
+        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
+    print()
 
-    ns_regex = re.compile(ns_regex, re.IGNORECASE)
+    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
+           "Douglas reminds us to always bring our towel. " \
+           "The main character in Doug's novel is the man Arthur Dent, " \
+           "but Douglas doesn't write about George Washington or Homer Simpson."
+    doc = nlp(text)
 
-    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
-        line = file.readline()
-        cnt = 0
-        while line:
-            if cnt % 5000000 == 0:
-                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
-            clean_line = line.strip().decode("utf-8")
-
-            matches = link_regex.findall(clean_line)
-            for match in matches:
-                match = match[2:][:-2].replace("_", " ").strip()
-
-                if ns_regex.match(match):
-                    pass  # ignore namespaces at the beginning of the string
-
-                # this is a simple link, with the alias the same as the mention
-                elif "|" not in match:
-                    _store_alias(match, match, normalize_alias=True, normalize_entity=True)
-
-                # in wiki format, the link is written as [[entity|alias]]
-                else:
-                    splits = match.split("|")
-                    entity = splits[0].strip()
-                    alias = splits[1].strip()
-                    # specific wiki format [[alias (specification)|]]
-                    if len(alias) == 0 and "(" in entity:
-                        alias = entity.split("(")[0]
-                        _store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
-                    else:
-                        _store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
-
-            line = file.readline()
-            cnt += 1
-
-    # write all aliases and their entities and occurrences to file
-    with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
-        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
-        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
-            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
-                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
 
 
-def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
-    alias = alias.strip()
-    entity = entity.strip()
+def add_coref():
+    """ STEP 5: add coreference resolution to our model """
+    nlp = spacy.load('en_core_web_sm')
+    # nlp = spacy.load('en')
 
-    # remove everything after # as this is not part of the title but refers to a specific paragraph
-    if normalize_entity:
-        # wikipedia titles are always capitalized
-        entity = capitalize_first(entity.split("#")[0])
-    if normalize_alias:
-        alias = alias.split("#")[0]
+    # TODO: this doesn't work yet
+    # neuralcoref.add_to_pipe(nlp)
+    print("done adding to pipe")
 
-    if alias and entity:
-        alias_dict = map_alias_to_link.get(alias, dict())
-        entity_count = alias_dict.get(entity, 0)
-        alias_dict[entity] = entity_count + 1
-        map_alias_to_link[alias] = alias_dict
+    doc = nlp(u'My sister has a dog. She loves him.')
+    print("done doc")
+
+    print(doc._.has_coref)
+    print(doc._.coref_clusters)
 
 
-def _read_wikipedia():
-    """ Read the XML wikipedia data """
+def create_training():
+    nlp = spacy.load('en_core_web_sm')
+    wp_to_id = _get_entity_to_id()
+    _read_wikipedia(nlp, wp_to_id, limit=10000)
+
+
+def _read_wikipedia(nlp, wp_to_id, limit=None):
+    """ Read the XML wikipedia data to parse out training data """
+
+    # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
+    # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
+
+    title_regex = re.compile(r'(?<=