mirror of https://github.com/explosion/spaCy.git
try/except per article to ensure the pipeline keeps running
This commit is contained in:
parent
bbcb9da466
commit
34600c92bd
|
@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
|||
|
||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 1
|
||||
cnt = 0
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
|
@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
|||
# finished reading this page
|
||||
elif clean_line == "</page>":
|
||||
if article_id:
|
||||
try:
|
||||
_process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
|
||||
# on a previous run, an error occurred after 46M lines and 2h
|
||||
except Exception as e:
|
||||
print("Error processing article", article_id, article_title)
|
||||
print(e)
|
||||
|
||||
# start reading text within a page
|
||||
if "<text" in clean_line:
|
||||
|
@ -585,7 +590,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
|||
|
||||
def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
|
||||
# remove the text tags
|
||||
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text>)')
|
||||
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')
|
||||
text = text_regex.search(article_text).group(0)
|
||||
|
||||
# stop processing if this is a redirect page
|
||||
|
|
Loading…
Reference in New Issue