diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index e6df39631..a0ffc3618 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -515,15 +515,12 @@ def add_coref(): def create_training(): nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia(nlp, wp_to_id, limit=10000) + _read_wikipedia_texts(nlp, wp_to_id, limit=10000) -def _read_wikipedia(nlp, wp_to_id, limit=None): +def _read_wikipedia_texts(nlp, wp_to_id, limit=None): """ Read the XML wikipedia data to parse out training data """ - # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - title_regex = re.compile(r'(?<=<title>).*(?=</title>)') id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)') @@ -589,18 +586,15 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): for alias, entity, norm in zip(aliases, entities, normalizations): entity_id = wp_to_id.get(entity) if entity_id: - # print(" ", alias, '-->', entity, '-->', entity_id) article_dict[alias] = entity_id article_dict[entity] = entity_id # get the raw text without markup etc clean_text = _get_clean_wp_text(text) - - #print(text) print(clean_text) - print() _run_ner(nlp, article_id, article_title, clean_text, article_dict) + print() info_regex = re.compile(r'{[^{]*?}') @@ -676,7 +670,15 @@ def _get_clean_wp_text(article_text): def _run_ner(nlp, article_id, article_title, clean_text, article_dict): - pass # TODO + doc = nlp(clean_text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to non-persons + ent_id = article_dict.get(ent.text) + if ent_id: + print(" -", ent.text, ent.label_, ent_id) + else: + print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases + if __name__ == "__main__": print("START", datetime.datetime.now())