# coding: utf-8 from __future__ import unicode_literals import os import re import bz2 import datetime from spacy.gold import GoldParse from . import wikipedia_processor as wp, kb_creator """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ # ENTITY_FILE = "gold_entities.csv" ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(wp_to_id, training_output, limit=None) def _process_wikipedia_texts(wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: raw text data + positive instances """ title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') read_ids = set() entityfile_loc = training_output + "/" + ENTITY_FILE with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: # write entity training header file _write_training_entity(outputfile=entityfile, article_id="article_id", alias="alias", entity="WD_id", start="start", end="end") with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 0 article_text = "" article_title = None article_id = None reading_text = False reading_revision = False while line and (not limit or cnt < limit): if cnt % 1000000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") if clean_line == "": reading_revision = True elif clean_line == "": reading_revision = False # Start reading new page if clean_line == "": article_text = "" article_title = None article_id = None # finished reading this page elif clean_line == "": if article_id: try: _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), training_output) except Exception as e: print("Error processing article", article_id, article_title, e) else: print("Done processing a page, but couldn't find an article_id ?", article_title) article_text = "" article_title = None article_id = None reading_text = False reading_revision = False # start reading text within a page if ").*(?= 2: reading_special_case = True if open_read == 2 and reading_text: reading_text = False reading_entity = True reading_mention = False # we just finished reading an entity if open_read == 0 and not reading_text: if '#' in entity_buffer or entity_buffer.startswith(':'): reading_special_case = True # Ignore cases with nested structures like File: handles etc if not reading_special_case: if not mention_buffer: mention_buffer = entity_buffer start = len(final_text) end = start + len(mention_buffer) qid = wp_to_id.get(entity_buffer, None) if qid: _write_training_entity(outputfile=entityfile, article_id=article_id, alias=mention_buffer, entity=qid, start=start, end=end) found_entities = True final_text += mention_buffer entity_buffer = "" mention_buffer = "" reading_text = True reading_entity = False reading_mention = False reading_special_case = False if found_entities: _write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output) info_regex = re.compile(r'{[^{]*?}') htlm_regex = re.compile(r'<!--[^-]*-->') category_regex = re.compile(r'\[\[Category:[^\[]*]]') file_regex = re.compile(r'\[\[File:[^[\]]+]]') ref_regex = re.compile(r'<ref.*?>') # non-greedy ref_2_regex = re.compile(r'</ref.*?>') # non-greedy def _get_clean_wp_text(article_text): clean_text = article_text.strip() # remove bolding & italic markup clean_text = clean_text.replace('\'\'\'', '') clean_text = clean_text.replace('\'\'', '') # remove nested {{info}} statements by removing the inner/smallest ones first and iterating try_again = True previous_length = len(clean_text) while try_again: clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { if len(clean_text) < previous_length: try_again = True else: try_again = False previous_length = len(clean_text) # remove HTML comments clean_text = htlm_regex.sub('', clean_text) # remove Category and File statements clean_text = category_regex.sub('', clean_text) clean_text = file_regex.sub('', clean_text) # remove multiple = while '==' in clean_text: clean_text = clean_text.replace("==", "=") clean_text = clean_text.replace(". =", ".") clean_text = clean_text.replace(" = ", ". ") clean_text = clean_text.replace("= ", ".") clean_text = clean_text.replace(" =", "") # remove refs (non-greedy match) clean_text = ref_regex.sub('', clean_text) clean_text = ref_2_regex.sub('', clean_text) # remove additional wikiformatting clean_text = re.sub(r'<blockquote>', '', clean_text) clean_text = re.sub(r'</blockquote>', '', clean_text) # change special characters back to normal ones clean_text = clean_text.replace(r'<', '<') clean_text = clean_text.replace(r'>', '>') clean_text = clean_text.replace(r'"', '"') clean_text = clean_text.replace(r'&nbsp;', ' ') clean_text = clean_text.replace(r'&', '&') # remove multiple spaces while ' ' in clean_text: clean_text = clean_text.replace(' ', ' ') return clean_text.strip() def _write_training_article(article_id, clean_text, training_output): file_loc = training_output + "/" + str(article_id) + ".txt" with open(file_loc, mode='w', encoding='utf8') as outputfile: outputfile.write(clean_text) def _write_training_entity(outputfile, article_id, alias, entity, start, end): outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n") def is_dev(article_id): return article_id.endswith("3") def read_training(nlp, training_dir, dev, limit): # This method provides training examples that correspond to the entity annotations found by the nlp object entityfile_loc = training_dir + "/" + ENTITY_FILE data = [] # we assume the data is written sequentially current_article_id = None current_doc = None ents_by_offset = dict() skip_articles = set() total_entities = 0 with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: if not limit or len(data) < limit: fields = line.replace('\n', "").split(sep='|') article_id = fields[0] alias = fields[1] wp_title = fields[2] start = fields[3] end = fields[4] if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles: if not current_doc or (current_article_id != article_id): # parse the new article text file_name = article_id + ".txt" try: with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f: text = f.read() if len(text) < 30000: # threshold for convenience / speed of processing current_doc = nlp(text) current_article_id = article_id ents_by_offset = dict() for ent in current_doc.ents: ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: skip_articles.add(current_article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: if found_ent.text != alias: skip_articles.add(current_article_id) current_doc = None else: sent = found_ent.sent.as_doc() # currently feeding the gold data one entity per sentence at a time gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char gold_entities = list() gold_entities.append((gold_start, gold_end, wp_title)) gold = GoldParse(doc=current_doc, links=gold_entities) data.append((sent, gold)) total_entities += 1 if len(data) % 500 == 0: print(" -read", total_entities, "entities") print(" -read", total_entities, "entities") return data