From 3163331b1ee4238265e9584247fc36965fb9da13 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 14 Apr 2019 21:52:01 +0200
Subject: [PATCH] wikipedia dump parser and mediawiki format regex cleanup

---
 examples/pipeline/wikidata_entity_linking.py | 81 +++++++++++++++++++-
 1 file changed, 80 insertions(+), 1 deletion(-)

diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index 11e4cc04c..02106bc31 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 """Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
 """
+import re
 import json
 import spacy
 import bz2
@@ -11,7 +12,8 @@ from spacy.kb import KnowledgeBase
 
 def create_kb(vocab):
     kb = KnowledgeBase(vocab=vocab)
-    _read_wikidata()
+    # _read_wikidata()
+    _read_wikipedia()
 
     # adding entities
     # kb.add_entity(entity=entity, prob=prob)
@@ -89,6 +91,83 @@ def _read_wikidata():
             cnt += 1
 
 
+def _read_wikipedia():
+    """ Read the XML Wikipedia data """
+    # TODO: remove hardcoded path
+
+    # with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file:
+    with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file:
+        line = file.readline()
+        cnt = 1
+        article_text = ""
+        article_title = None
+        article_id = None
+        reading_text = False
+        while line and cnt < 10000:
+            clean_line = line.strip().decode("utf-8")
+
+            # start reading a new page
+            if clean_line == "<page>":
+                article_text = ""
+                article_title = None
+                article_id = None
+
+            # finished reading this page
+            elif clean_line == "</page>":
+                if article_id:
+                    _store_wp_article(article_id, article_title, article_text.strip())
+
+            # start reading text within a page
+            if "<text" in clean_line:
+                reading_text = True
+
+            if reading_text:
+                article_text += " " + clean_line
+
+            # stop reading text within a page
+            if "</text" in clean_line:
+                reading_text = False
+
+            # read the ID of this article
+            ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)
+            if ids:
+                article_id = ids[0]
+
+            # read the title of this article
+            titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)
+            if titles:
+                article_title = titles[0].strip()
+
+            line = file.readline()
+            cnt += 1
+
+
+def _store_wp_article(article_id, article_title, article_text):
+    print("WP article", article_id, ":", article_title)
+    print(article_text)
+    print(_get_clean_wp_text(article_text))
+    print()
+
+
+def _get_clean_wp_text(article_text):
+    # remove category statements
+    clean_text = re.sub(r'\[\[Category:.*\]\]', '', article_text)
+
+    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
+    try_again = True
+    previous_length = len(clean_text)
+    while try_again:
+        clean_text = re.sub(r'{[^{]*?}', '', clean_text)  # non-greedy match of the innermost braces
+        print(clean_text)
+        if len(clean_text) < previous_length:
+            try_again = True
+        else:
+            try_again = False
+        previous_length = len(clean_text)
+
+    return clean_text
+
+
 def add_el(kb, nlp):
     el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
     nlp.add_pipe(el_pipe, last=True)
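
The page-scanning loop in _read_wikipedia can be sanity-checked without downloading the multi-gigabyte dump by running the same logic over an in-memory handle that mimics the bz2 file's bytes/readline interface. The snippet below is a minimal sketch of that; the sample page, including its title, id and text, is invented for illustration:

    import io
    import re

    # In-memory stand-in for the bz2 dump handle (same bytes + readline interface).
    # The page below, including its title, id and text, is made up for illustration.
    sample = io.BytesIO(b"""<mediawiki>
    <page>
    <title>Sample article</title>
    <id>12345</id>
    <text xml:space="preserve">Some wikitext with a [[Category:Example]] link</text>
    </page>
    </mediawiki>
    """)

    article_text, article_title, article_id = "", None, None
    reading_text = False
    line = sample.readline()
    while line:
        clean_line = line.strip().decode("utf-8")
        if clean_line == "<page>":                    # reset state for a new page
            article_text, article_title, article_id = "", None, None
        elif clean_line == "</page>" and article_id:  # page complete
            print("parsed article", article_id, ":", article_title)
        if "<text" in clean_line:                     # text block starts (and may end) on this line
            reading_text = True
        if reading_text:
            article_text += " " + clean_line
        if "</text" in clean_line:
            reading_text = False
        ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)
        if ids:
            article_id = ids[0]
        titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)
        if titles:
            article_title = titles[0].strip()
        line = sample.readline()

The lookbehind/lookahead pattern (?<=<id>)\d*(?=</id>) captures the digits between the id tags without including the tags themselves. The whole approach assumes the dump keeps <page>, <title> and <id> on their own lines, which the line-oriented enwiki exports generally satisfy. One caveat for full dumps: <id> elements also appear inside <revision> and <contributor> blocks, so the last match on a page would overwrite the page id; a guard that only accepts the first id per page would be needed there.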
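
Why the cleanup in _get_clean_wp_text needs a loop: a single regex pass cannot match nested braces, so the pattern deliberately targets only the innermost {...} span (the [^{] class refuses to cross a nested opening brace), and the function repeats until a pass no longer shrinks the text. A compact restatement of the same idea, on a made-up wikitext snippet:

    import re

    def strip_nested_braces(text):
        # Each pass deletes only the innermost {...} spans: '[^{]' cannot cross a
        # nested opening brace, so outer templates collapse on later passes.
        previous_length = len(text) + 1
        while len(text) < previous_length:
            previous_length = len(text)
            text = re.sub(r'{[^{]*?}', '', text)
        return text

    # Invented snippet with one template nested inside an infobox:
    snippet = "Intro {{Infobox person | name = {{nowrap|Ada}} }} body text"
    print(strip_nested_braces(snippet))  # prints 'Intro  body text'

Running it peels the markup away layer by layer: {{nowrap|Ada}} goes first, then the surrounding {{Infobox ...}} shell on the following passes.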