From f5190267e7ba24d395b9933d9fd2ab63c8e5e866 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 3 May 2019 18:09:09 +0200
Subject: [PATCH] run only 100M of WP data as training dataset (9%)

---
 examples/pipeline/wikidata_entity_linking.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index 0db7f4665..4fe97e874 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -520,7 +520,7 @@ def create_training(kb):
         raise ValueError("kb should be defined")
     # nlp = spacy.load('en_core_web_sm')
     wp_to_id = _get_entity_to_id()
-    _read_wikipedia_texts(kb, wp_to_id, limit=None)
+    _read_wikipedia_texts(kb, wp_to_id, limit=100000000)  # TODO: full dataset


 def _read_wikipedia_texts(kb, wp_to_id, limit=None):
@@ -552,7 +552,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
             reading_text = False
             reading_revision = False
             while line and (not limit or cnt < limit):
-                if cnt % 500000 == 0:
+                if cnt % 1000000 == 0:
                     print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                 clean_line = line.strip().decode("utf-8")
                 # print(clean_line)
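
The sketch below (not part of the patch) illustrates the behavior the hunks change: streaming a bz2-compressed
Wikipedia XML dump line by line, stopping after an optional `limit` of lines, and printing progress every
`log_every` lines. The names `read_dump_lines`, `dump_path`, and `log_every` are hypothetical and only stand
in for the corresponding logic in `_read_wikipedia_texts`.

    import bz2
    import datetime


    def read_dump_lines(dump_path, limit=None, log_every=1000000):
        # Hypothetical helper, assuming `dump_path` points to a bz2-compressed XML dump.
        cnt = 0
        with bz2.open(dump_path, mode="rb") as f:
            line = f.readline()
            # Stop when the file is exhausted or the optional line cap is reached.
            while line and (not limit or cnt < limit):
                if cnt % log_every == 0:
                    print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = line.strip().decode("utf-8")
                yield clean_line  # downstream parsing would consume this
                line = f.readline()
                cnt += 1

With `limit=100000000`, as in the patched call, the loop reads at most 100M lines of the dump instead of the
full file, which is what the commit subject refers to as roughly 9% of the data.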