diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index 11e4cc04c..02106bc31 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""
+import re
import json
import spacy
import bz2
@@ -11,7 +12,8 @@ from spacy.kb import KnowledgeBase
def create_kb(vocab):
kb = KnowledgeBase(vocab=vocab)
- _read_wikidata()
+ # _read_wikidata()
+ _read_wikipedia()
# adding entities
# kb.add_entity(entity=entity, prob=prob)
@@ -89,6 +91,83 @@ def _read_wikidata():
cnt += 1
+def _read_wikipedia():
+    """Stream a bz2-compressed Wikipedia XML dump and hand each complete
+    article (id, title, text) to _store_wp_article().
+
+    The dump is read line by line; per-article state is reset on each
+    opening <page> tag and flushed on the closing </page> tag. Only the
+    first 10000 lines are processed (development cap).
+    """
+    # TODO remove hardcoded path
+    with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file:
+        line = file.readline()
+        cnt = 1
+        article_text = ""
+        article_title = None
+        article_id = None
+        reading_text = False
+        while line and cnt < 10000:
+            clean_line = line.strip().decode("utf-8")
+
+            # Start reading a new page: reset all per-article state
+            if clean_line == "<page>":
+                article_text = ""
+                article_title = None
+                article_id = None
+
+            # finished reading this page: store it if an ID was found
+            elif clean_line == "</page>":
+                if article_id:
+                    _store_wp_article(article_id, article_title, article_text.strip())
+
+            # start reading text within a page ("<text" opens the body element,
+            # which carries attributes, hence substring match rather than equality)
+            if "<text" in clean_line:
+                reading_text = True
+
+            if reading_text:
+                article_text += " " + clean_line
+
+            # stop reading text within a page
+            if "</text" in clean_line:
+                reading_text = False
+
+            # read the ID of this article
+            ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)
+            if ids:
+                article_id = ids[0]
+
+            # read the title of this article
+            titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)
+            if titles:
+                article_title = titles[0].strip()
+
+            line = file.readline()
+            cnt += 1
+
+
+def _store_wp_article(article_id, article_title, article_text):
+    """Placeholder sink for one parsed Wikipedia article.
+
+    Currently only prints the id/title, the raw text, and the cleaned text
+    for inspection; presumably to be replaced by real storage later
+    (TODO confirm intended destination).
+    """
+    print("WP article", article_id, ":", article_title)
+    print(article_text)
+    print(_get_clean_wp_text(article_text))
+    print()
+
+
+def _get_clean_wp_text(article_text):
+    """Strip basic Wikipedia markup from raw article text.
+
+    Removes [[Category:...]] statements and {...}/{{...}} template blocks
+    (nested templates are handled by repeatedly deleting the innermost
+    braced spans until a pass removes nothing), and returns the cleaned
+    string.
+    """
+    # remove category statements (raw string: '\[' in a plain literal is an
+    # invalid escape sequence and warns on modern Python)
+    clean_text = re.sub(r'\[\[Category:.*\]\]', '', article_text)
+
+    # remove nested {{info}} statements by removing the inner/smallest ones
+    # first and iterating until the text stops shrinking
+    previous_length = len(clean_text)
+    while True:
+        clean_text = re.sub(r'{[^{]*?}', '', clean_text)  # non-greedy match
+        if len(clean_text) >= previous_length:
+            break  # nothing removed this pass; all braces are gone
+        previous_length = len(clean_text)
+
+    return clean_text
+
+
def add_el(kb, nlp):
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
nlp.add_pipe(el_pipe, last=True)