fix WP id parsing, speed up processing and remove ambiguous strings in one doc (for now)

This commit is contained in:
svlandeg 2019-05-03 17:37:47 +02:00
parent 34600c92bd
commit 4e929600e5
1 changed file with 110 additions and 77 deletions


@@ -29,7 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
 VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
-TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
+TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
+TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'

 # these will/should be matched ignoring case
@@ -523,17 +524,24 @@ def create_training(kb):
 def _read_wikipedia_texts(kb, wp_to_id, limit=None):
-    """ Read the XML wikipedia data to parse out training data """
+    """
+    Read the XML wikipedia data to parse out training data:
+    raw text data + positive and negative instances
+    """
     title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
     id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')

-    # read entity training header file
-    _write_training_entity(article_id="article_id",
-                           alias="alias",
-                           entity="entity",
-                           correct="correct",
-                           append=False)
+    read_ids = set()
+
+    entityfile_loc = TRAINING_OUTPUT_SET_DIR + "/" + "gold_entities.csv"
+    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
+        # write entity training header file
+        _write_training_entity(outputfile=entityfile,
+                               article_id="article_id",
+                               alias="alias",
+                               entity="entity",
+                               correct="correct")

-    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
-        line = file.readline()
+        with bz2.open(ENWIKI_DUMP, mode='rb') as file:
+            line = file.readline()
@@ -542,10 +550,17 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
             article_title = None
             article_id = None
             reading_text = False
+            reading_revision = False
             while line and (not limit or cnt < limit):
                 if cnt % 500000 == 0:
                     print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                 clean_line = line.strip().decode("utf-8")
+                # print(clean_line)
+
+                if clean_line == "<revision>":
+                    reading_revision = True
+                elif clean_line == "</revision>":
+                    reading_revision = False

                 # Start reading new page
                 if clean_line == "<page>":
@@ -557,11 +572,20 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
                 elif clean_line == "</page>":
                     if article_id:
                         try:
-                            _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
+                            _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip())
                             # on a previous run, an error occurred after 46M lines and 2h
                         except Exception as e:
                             print("Error processing article", article_id, article_title)
                             print(e)
+                    else:
+                        print("Done processing a page, but couldn't find an article_id ?")
+                        print(article_title)
+                        print(article_text)
+                    article_text = ""
+                    article_title = None
+                    article_id = None
+                    reading_text = False
+                    reading_revision = False

                 # start reading text within a page
                 if "<text" in clean_line:
@@ -574,12 +598,17 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
                 if "</text" in clean_line:
                     reading_text = False

-                # read the ID of this article
-                ids = id_regex.search(clean_line)
-                if ids:
-                    article_id = ids[0]
+                # read the ID of this article (outside the revision portion of the document)
+                if not reading_revision:
+                    ids = id_regex.search(clean_line)
+                    if ids:
+                        article_id = ids[0]
+                        if article_id in read_ids:
+                            print("Found duplicate article ID", article_id, clean_line)  # This should never happen ...
+                        read_ids.add(article_id)

-                # read the title of this article
-                titles = title_regex.search(clean_line)
-                if titles:
-                    article_title = titles[0].strip()
+                # read the title of this article (outside the revision portion of the document)
+                if not reading_revision:
+                    titles = title_regex.search(clean_line)
+                    if titles:
+                        article_title = titles[0].strip()
@@ -588,9 +617,11 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
                 cnt += 1


-def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
-    # remove the text tags
-    text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')
+text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')
+
+
+def _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text):
+    # remove the text tags
     text = text_regex.search(article_text).group(0)

     # stop processing if this is a redirect page
@@ -607,12 +638,19 @@ def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
     # print(clean_text)

     article_dict = dict()
+    ambiguous_aliases = set()
     aliases, entities, normalizations = _get_wp_links(text)
     for alias, entity, norm in zip(aliases, entities, normalizations):
-        entity_id = wp_to_id.get(entity)
-        if entity_id:
-            article_dict[alias] = entity_id
+        if alias not in ambiguous_aliases:
+            entity_id = wp_to_id.get(entity)
+            if entity_id:
+                # TODO: take care of these conflicts ! Currently they are being removed from the dataset
+                if article_dict.get(alias) and article_dict[alias] != entity_id:
+                    ambiguous_aliases.add(alias)
+                    article_dict.pop(alias)
+                    # print("Found conflicting alias", alias, "in article", article_id, article_title)
+                else:
+                    article_dict[alias] = entity_id
+                    article_dict[entity] = entity_id

     # print("found entities:")
     for alias, entity in article_dict.items():
@@ -627,18 +665,18 @@ def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
         # print all incorrect candidates
         for c in candidates:
             if entity != c.entity_:
-                _write_training_entity(article_id=article_id,
+                _write_training_entity(outputfile=entityfile,
+                                       article_id=article_id,
                                        alias=alias,
                                        entity=c.entity_,
-                                       correct="0",
-                                       append=True)
+                                       correct="0")

         # print the one correct candidate
-        _write_training_entity(article_id=article_id,
+        _write_training_entity(outputfile=entityfile,
+                               article_id=article_id,
                                alias=alias,
                                entity=entity,
-                               correct="1",
-                               append=True)
+                               correct="1")

         # print("gold entity", entity)
         # print()
@@ -720,17 +758,12 @@ def _get_clean_wp_text(article_text):

 def _write_training_article(article_id, clean_text):
-    file_loc = TRAINING_SET_DIR + "/" + str(article_id) + ".txt"
+    file_loc = TRAINING_OUTPUT_SET_DIR + "/" + str(article_id) + ".txt"
     with open(file_loc, mode='w', encoding='utf8') as outputfile:
         outputfile.write(clean_text)


-def _write_training_entity(article_id, alias, entity, correct, append=True):
-    mode = "w"
-    if append:
-        mode = "a"
-
-    file_loc = TRAINING_SET_DIR + "/" + "gold_entities.csv"
-    with open(file_loc, mode=mode, encoding='utf8') as outputfile:
-        outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
+def _write_training_entity(outputfile, article_id, alias, entity, correct):
+    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
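
For readers skimming the diff, here is a minimal, standalone sketch of the ambiguous-alias filtering that _process_wp_text gains above: when the same alias is linked to two different entity IDs within one article, the alias is dropped from the training dictionary rather than kept with a conflicting label. This is an illustration, not the committed code; the helper name collect_unambiguous_aliases and the toy IDs are invented, and wp_to_id is assumed to map Wikipedia titles to knowledge-base IDs as in the script.

# Hypothetical sketch of the ambiguous-alias filtering idea from this commit;
# names and data below are illustrative, not part of the committed script.

def collect_unambiguous_aliases(aliases, entities, wp_to_id):
    article_dict = dict()
    ambiguous_aliases = set()
    for alias, entity in zip(aliases, entities):
        if alias in ambiguous_aliases:
            continue  # this alias was already found to be conflicting
        entity_id = wp_to_id.get(entity)
        if not entity_id:
            continue  # the linked entity is not in the knowledge base
        if article_dict.get(alias) and article_dict[alias] != entity_id:
            # the same alias points to two different entities: discard it
            ambiguous_aliases.add(alias)
            article_dict.pop(alias)
        else:
            article_dict[alias] = entity_id
    return article_dict


# toy usage: "Paris" maps to two different IDs, so it is dropped
print(collect_unambiguous_aliases(
    ["Paris", "Paris", "Louvre"],
    ["Paris", "Paris (mythology)", "Louvre"],
    {"Paris": "Q1", "Paris (mythology)": "Q2", "Louvre": "Q3"},
))
# -> {'Louvre': 'Q3'}

Dropping the alias entirely, rather than keeping whichever mapping came first, trades some training examples for cleaner labels; the TODO in the diff flags this as something to revisit.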