diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index cf388773a..a9be49742 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
line = file.readline()
- cnt = 1
+ cnt = 0
article_text = ""
article_title = None
article_id = None
@@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
# finished reading this page
elif clean_line == "":
if article_id:
- _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
+ try:
+ _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
+ # on a previous run, an error occurred after 46M lines and 2h
+ except Exception as e:
+ print("Error processing article", article_id, article_title)
+ print(e)
# start reading text within a page
if ").*(?=)')
+ text_regex = re.compile(r'(?<=).*(?=