Run NER on clean Wikipedia (WP) text and link the recognized entities to gold-standard entity IDs

This commit is contained in:
svlandeg 2019-05-02 17:24:52 +02:00
parent 581dc9742d
commit cba9680d13
1 changed file with 12 additions and 10 deletions

View File

@ -515,15 +515,12 @@ def add_coref():
def create_training():
    """Load the NER pipeline and the entity-ID mapping, then process the Wikipedia texts.

    Loads the spaCy model 'en_core_web_sm', fetches the mapping returned by
    `_get_entity_to_id()`, and hands both to `_read_wikipedia_texts` with
    `limit=10000`.
    """
    nlp = spacy.load('en_core_web_sm')
    wp_to_id = _get_entity_to_id()
    # The reader was renamed from `_read_wikipedia` to `_read_wikipedia_texts`;
    # only the renamed function is called so the data is not read twice.
    _read_wikipedia_texts(nlp, wp_to_id, limit=10000)
def _read_wikipedia(nlp, wp_to_id, limit=None):
def _read_wikipedia_texts(nlp, wp_to_id, limit=None):
""" Read the XML wikipedia data to parse out training data """
# regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
# regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
@ -589,18 +586,15 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text):
for alias, entity, norm in zip(aliases, entities, normalizations):
entity_id = wp_to_id.get(entity)
if entity_id:
# print(" ", alias, '-->', entity, '-->', entity_id)
article_dict[alias] = entity_id
article_dict[entity] = entity_id
# get the raw text without markup etc
clean_text = _get_clean_wp_text(text)
#print(text)
print(clean_text)
print()
_run_ner(nlp, article_id, article_title, clean_text, article_dict)
print()
# Matches a '{', then the shortest run of characters other than '{', up to the next '}'.
# NOTE(review): presumably used to strip {...} markup when cleaning article text — confirm against its callers.
info_regex = re.compile(r'{[^{]*?}')
@ -676,7 +670,15 @@ def _get_clean_wp_text(article_text):
def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
    """Run NER over an article's clean text and print every PERSON entity with its ID.

    Each entity whose label is "PERSON" is looked up by its exact surface text
    in `article_dict`; the entity text, its label and the ID found are printed
    on one line. When the lookup yields nothing (or a falsy value), '???' is
    printed in place of the ID.

    Args:
        nlp: Callable that turns text into a document exposing `.ents`, where
            each entity has `.text` and `.label_`.
        article_id: Currently unused.
        article_title: Currently unused.
        clean_text: The text to run the pipeline on.
        article_dict: Mapping from mention text to entity ID.

    Returns:
        None; results are only printed.
    """
    doc = nlp(clean_text)
    for ent in doc.ents:
        # TODO: expand to non-persons
        if ent.label_ != "PERSON":
            continue
        # TODO: investigate the cases that end up as '???' (no ID for this text)
        ent_id = article_dict.get(ent.text) or '???'
        print(" -", ent.text, ent.label_, ent_id)
if __name__ == "__main__":
print("START", datetime.datetime.now())