mirror of https://github.com/explosion/spaCy.git
Run NER on clean Wikipedia (WP) text and link the recognized entities to gold-standard entity IDs
This commit is contained in:
parent
581dc9742d
commit
cba9680d13
|
@ -515,15 +515,12 @@ def add_coref():
|
||||||
def create_training():
|
def create_training():
|
||||||
nlp = spacy.load('en_core_web_sm')
|
nlp = spacy.load('en_core_web_sm')
|
||||||
wp_to_id = _get_entity_to_id()
|
wp_to_id = _get_entity_to_id()
|
||||||
_read_wikipedia(nlp, wp_to_id, limit=10000)
|
_read_wikipedia_texts(nlp, wp_to_id, limit=10000)
|
||||||
|
|
||||||
|
|
||||||
def _read_wikipedia(nlp, wp_to_id, limit=None):
|
def _read_wikipedia_texts(nlp, wp_to_id, limit=None):
|
||||||
""" Read the XML wikipedia data to parse out training data """
|
""" Read the XML wikipedia data to parse out training data """
|
||||||
|
|
||||||
# regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
|
|
||||||
# regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
|
|
||||||
|
|
||||||
title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
|
title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
|
||||||
id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
|
id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
|
||||||
|
|
||||||
|
@ -589,18 +586,15 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text):
|
||||||
for alias, entity, norm in zip(aliases, entities, normalizations):
|
for alias, entity, norm in zip(aliases, entities, normalizations):
|
||||||
entity_id = wp_to_id.get(entity)
|
entity_id = wp_to_id.get(entity)
|
||||||
if entity_id:
|
if entity_id:
|
||||||
# print(" ", alias, '-->', entity, '-->', entity_id)
|
|
||||||
article_dict[alias] = entity_id
|
article_dict[alias] = entity_id
|
||||||
article_dict[entity] = entity_id
|
article_dict[entity] = entity_id
|
||||||
|
|
||||||
# get the raw text without markup etc
|
# get the raw text without markup etc
|
||||||
clean_text = _get_clean_wp_text(text)
|
clean_text = _get_clean_wp_text(text)
|
||||||
|
|
||||||
#print(text)
|
|
||||||
print(clean_text)
|
print(clean_text)
|
||||||
print()
|
|
||||||
|
|
||||||
_run_ner(nlp, article_id, article_title, clean_text, article_dict)
|
_run_ner(nlp, article_id, article_title, clean_text, article_dict)
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
info_regex = re.compile(r'{[^{]*?}')
|
info_regex = re.compile(r'{[^{]*?}')
|
||||||
|
@ -676,7 +670,15 @@ def _get_clean_wp_text(article_text):
|
||||||
|
|
||||||
|
|
||||||
def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
|
def _run_ner(nlp, article_id, article_title, clean_text, article_dict):
|
||||||
pass # TODO
|
doc = nlp(clean_text)
|
||||||
|
for ent in doc.ents:
|
||||||
|
if ent.label_ == "PERSON": # TODO: expand to non-persons
|
||||||
|
ent_id = article_dict.get(ent.text)
|
||||||
|
if ent_id:
|
||||||
|
print(" -", ent.text, ent.label_, ent_id)
|
||||||
|
else:
|
||||||
|
print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("START", datetime.datetime.now())
|
print("START", datetime.datetime.now())
|
||||||
|
|
Loading…
Reference in New Issue