From 4e929600e53f9650f254c7beb17292fca7a20df5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 3 May 2019 17:37:47 +0200 Subject: [PATCH] fix WP id parsing, speed up processing and remove ambiguous strings in one doc (for now) --- examples/pipeline/wikidata_entity_linking.py | 187 +++++++++++-------- 1 file changed, 110 insertions(+), 77 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a9be49742..0db7f4665 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -29,7 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' # these will/should be matched ignoring case @@ -523,74 +524,104 @@ def create_training(kb): def _read_wikipedia_texts(kb, wp_to_id, limit=None): - """ Read the XML wikipedia data to parse out training data """ + """ + Read the XML wikipedia data to parse out training data: + raw text data + positive and negative instances + """ title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') - # read entity training header file - _write_training_entity(article_id="article_id", - alias="alias", - entity="entity", - correct="correct", - append=False) + read_ids = set() - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - - # finished reading this page - elif clean_line == "": - if article_id: - try: - _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) - # on a previous run, an error occurred after 46M lines and 2h - except Exception as e: - print("Error processing article", article_id, article_title) - print(e) - - # start reading text within a page - if "": + reading_revision = True + elif clean_line == "": + reading_revision = False + + # Start reading new page + if clean_line == "": + article_text = "" + article_title = None + article_id = None + + # finished reading this page + elif clean_line == "": + if article_id: + try: + _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip()) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title) + print(e) + else: + print("Done processing a page, but couldn't find an article_id ?") + print(article_title) + print(article_text) + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + + # start reading text within a page + if ").*(?=).*(?=