mirror of https://github.com/explosion/spaCy.git
167 lines
6.9 KiB
Python
167 lines
6.9 KiB
Python
|
# coding: utf-8
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
import re
|
||
|
import bz2
|
||
|
import json
|
||
|
import datetime
|
||
|
|
||
|
# TODO: remove hardcoded paths
|
||
|
# Location of the bz2-compressed Wikidata JSON dump that the reader functions in this module open.
# NOTE(review): this is a machine-specific absolute path; adapt it before running on another machine.
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
|
||
|
|
||
|
|
||
|
def _claim_entity_id(claim):
    """Return the entity ID (e.g. 'Q5') referenced by a claim's main snak, or None.

    Claims without a datavalue, or whose value is not a dict (a plain string, for instance),
    do not reference another entity; they yield None instead of raising.
    """
    datavalue = claim.get('mainsnak', {}).get('datavalue')
    if not isinstance(datavalue, dict):
        return None
    value = datavalue.get('value')
    if not isinstance(value, dict):
        return None
    return value.get('id')


def _entity_matches_filter(claims, prop_filter):
    """Return True if any non-deprecated claim carries a value listed in prop_filter.

    prop_filter maps a property ID to a set of accepted entity IDs. The filter is an OR:
    a single matching claim is enough, so the scan stops at the first hit.
    """
    if not isinstance(claims, dict):
        return False
    for prop, value_set in prop_filter.items():
        for claim in claims.get(prop) or ():
            if claim.get('rank') != "deprecated" and _claim_entity_id(claim) in value_set:
                return True
    return False


def read_wikidata_entities_json(limit=None, to_print=False, wikidata_file=None):
    """Read the Wikidata JSON dump and map English Wikipedia titles to Wikidata IDs.

    The dump is read line by line, one JSON entity per line. Only entries of type "item"
    with a non-deprecated P31 claim pointing to Q5 or Q15632617 are kept. The original
    author reported about 7h30 to parse 55M lines.

    limit: maximum number of lines to read from the dump; None or 0 reads everything.
    to_print: if True, print the parsed fields of every kept entity.
    wikidata_file: path to the bz2-compressed dump; defaults to WIKIDATA_JSON.
    RETURNS (dict): enwiki page title -> Wikidata ID, for every kept entity that has an
        enwiki sitelink.
    """
    if wikidata_file is None:
        wikidata_file = WIKIDATA_JSON

    languages = {'en', 'de'}
    prop_filter = {'P31': {'Q5', 'Q15632617'}}     # currently defined as OR: one property suffices to be selected
    site_filter = 'enwiki'

    title_to_id = dict()

    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_descriptions = False
    parse_aliases = False

    with bz2.open(wikidata_file, mode='rb') as dump:
        for cnt, line in enumerate(dump):
            if limit and cnt >= limit:
                break
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")

            # every entity line except the last ends with a comma that is not part of the object
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            # skips blank lines and the lone "[" / "]" lines that wrap the entity list
            if len(clean_line) <= 1:
                continue

            obj = json.loads(clean_line)
            entry_type = obj["type"]
            if entry_type != "item":
                continue

            # filtering records on their properties
            claims = obj["claims"]
            if not _entity_matches_filter(claims, prop_filter):
                continue

            unique_id = obj["id"]

            if to_print:
                print("ID:", unique_id)
                print("type:", entry_type)

            # parsing all properties that refer to other entities
            if parse_properties:
                for prop, claim_property in claims.items():
                    cp_values = [eid for eid in map(_claim_entity_id, claim_property) if eid is not None]
                    if cp_values and to_print:
                        print("prop:", prop, cp_values)

            if parse_sitelinks:
                # tolerate a missing or empty sitelinks field
                site_value = (obj.get("sitelinks") or {}).get(site_filter, None)
                if site_value:
                    site = site_value['title']
                    if to_print:
                        print(site_filter, ":", site)
                    title_to_id[site] = unique_id

            # sorted() keeps the printed language order stable between runs
            if parse_labels:
                labels = obj.get("labels") or {}
                for lang in sorted(languages):
                    lang_label = labels.get(lang, None)
                    if lang_label and to_print:
                        print("label (" + lang + "):", lang_label["value"])

            if parse_descriptions:
                descriptions = obj.get("descriptions") or {}
                for lang in sorted(languages):
                    lang_descr = descriptions.get(lang, None)
                    if lang_descr and to_print:
                        print("description (" + lang + "):", lang_descr["value"])

            if parse_aliases:
                aliases = obj.get("aliases") or {}
                for lang in sorted(languages):
                    for item in aliases.get(lang, None) or ():
                        if to_print:
                            print("alias (" + lang + "):", item["value"])

            if to_print:
                print()

    return title_to_id
|
||
|
|
||
|
|
||
|
def _read_wikidata_entities_regex_depr(limit=None, wikidata_file=None):
    """Read the Wikidata JSON dump and map enwiki titles to Wikidata IDs using regular expressions.

    Deprecated alternative to JSON parsing: each line is scanned with regexes instead of being
    decoded as JSON. The original author marked it as not working yet and as a candidate for
    deletion, so treat the results as approximate.

    limit: maximum number of lines to read from the dump; None or 0 reads everything.
    wikidata_file: path to the bz2-compressed dump; defaults to WIKIDATA_JSON.
    RETURNS (dict): enwiki page title -> Wikidata ID for lines whose P31 snak references
        Q5 or Q15632617.
    """
    if wikidata_file is None:
        wikidata_file = WIKIDATA_JSON

    regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
    regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
    regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
    regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)

    # P31 targets that select an entity
    wanted_ids = {"Q5", "Q15632617"}

    title_to_id = dict()

    with bz2.open(wikidata_file, mode='rb') as dump:
        for cnt, raw_line in enumerate(dump):
            if limit and cnt >= limit:
                break
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")

            clean_line = raw_line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            # skips blank lines and the lone "[" / "]" lines that wrap the entity list
            if len(clean_line) <= 1:
                continue

            text = clean_line.decode("utf-8")

            # an id match looks like '"id":"Q5"': drop the 6-char prefix '"id":"' and the closing quote
            keep = any(
                id_match[6:-1] in wanted_ids
                for p31_match in regex_p31.findall(text)
                for id_match in regex_id.findall(p31_match)
            )
            if not keep:
                continue

            # keep implies at least one id match in the line, so search() cannot return None here;
            # the first id in the line is taken as the entity's own ID
            entity_id = regex_id.search(text).group(0)[6:-1]

            for enwiki_match in regex_enwiki.findall(text):
                title_match = regex_title.search(enwiki_match)
                if title_match is None:
                    # sitelink object without a title: nothing to record
                    continue
                # a title match looks like '"title":"X"': drop the 9-char prefix and the closing quote
                title_to_id[title_match.group(0)[9:-1]] = entity_id

    return title_to_id
|