2019-05-06 08:56:56 +00:00
|
|
|
# coding: utf-8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import bz2
|
|
|
|
import json
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
|
2019-06-19 07:15:43 +00:00
|
|
|
def _statement_entity_id(statement):
    """Return the entity ID a single claim statement points to, or None.

    The statement value is only inspected when it is a dict; values of any
    other type (or statements without a datavalue) yield None.
    """
    value = (statement["mainsnak"].get("datavalue") or {}).get("value")
    if isinstance(value, dict):
        return value.get("id")
    return None


def _passes_prop_filter(claims, prop_filter):
    """Return True if any non-deprecated statement matches the property filter.

    The filter is an OR: a single matching property/value pair is sufficient.
    """
    for prop, value_set in prop_filter.items():
        for statement in claims.get(prop) or []:
            entity_id = _statement_entity_id(statement)
            if statement["rank"] != "deprecated" and entity_id in value_set:
                return True
    return False


def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False):
    """Read a bz2-compressed WikiData JSON dump and parse out the entities.

    The dump (latest-all.json.bz2) can be downloaded from
    https://dumps.wikimedia.org/wikidatawiki/entities/
    According to the original author's note, parsing 55M lines takes
    about 7h30.

    Each line of the dump is expected to hold one JSON entity, optionally
    followed by a trailing comma; lines of at most one character after
    stripping (such as the enclosing "[" and "]") are skipped. Only entities
    of type "item" are processed.

    wikidata_file: location of the dump, handed to bz2.open as-is.
    limit: maximum number of lines to read; None or 0 means no limit.
    to_print: if True, print every parsed field to stdout.
    RETURNS: a tuple (title_to_id, id_to_descr) where title_to_id maps an
        English Wikipedia page title to its WikiData ID, and id_to_descr maps
        a WikiData ID to its English description. Descriptions are only
        stored for items that have an English Wikipedia sitelink.
    """
    lang = "en"
    site_filter = "enwiki"

    # Properties filter (currently disabled to get ALL data). It is defined as
    # an OR: one matching property suffices for an item to be selected, e.g.
    # prop_filter = {'P31': {'Q5', 'Q15632617'}}
    prop_filter = dict()

    title_to_id = dict()
    id_to_descr = dict()

    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_descriptions = True
    parse_aliases = False
    parse_claims = False

    with bz2.open(wikidata_file, mode="rb") as file:
        cnt = 0
        for line in file:
            if limit and cnt >= limit:
                break
            if cnt % 1000000 == 0:
                print(
                    datetime.datetime.now(), "processed", cnt, "lines of WikiData JSON dump"
                )
            # cnt is not used again in this iteration, so it is safe to bump
            # it here and use guard clauses (continue) below.
            cnt += 1

            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) <= 1:
                continue

            obj = json.loads(clean_line)
            entry_type = obj["type"]
            if entry_type != "item":
                continue

            # A missing or empty section must not abort a multi-hour parse,
            # so every optional section is read with a fallback.
            claims = obj.get("claims") or {}

            # Without an active filter every item is kept; with one, an item
            # is kept only when at least one statement matches.
            if parse_claims and prop_filter:
                keep = _passes_prop_filter(claims, prop_filter)
            else:
                keep = True
            if not keep:
                continue

            unique_id = obj["id"]

            if to_print:
                print("ID:", unique_id)
                print("type:", entry_type)

            # parsing all properties that refer to other entities
            if parse_properties:
                for prop, claim_property in claims.items():
                    cp_values = [
                        entity_id
                        for entity_id in map(_statement_entity_id, claim_property)
                        if entity_id is not None
                    ]
                    if cp_values and to_print:
                        print("prop:", prop, cp_values)

            found_link = False
            if parse_sitelinks:
                site_value = (obj.get("sitelinks") or {}).get(site_filter)
                if site_value:
                    site = site_value["title"]
                    if to_print:
                        print(site_filter, ":", site)
                    title_to_id[site] = unique_id
                    found_link = True

            if parse_labels:
                lang_label = (obj.get("labels") or {}).get(lang)
                if lang_label and to_print:
                    print("label (" + lang + "):", lang_label["value"])

            # Descriptions are only of interest for items we could link to a
            # Wikipedia page.
            if found_link and parse_descriptions:
                lang_descr = (obj.get("descriptions") or {}).get(lang)
                if lang_descr:
                    if to_print:
                        print(
                            "description (" + lang + "):",
                            lang_descr["value"],
                        )
                    id_to_descr[unique_id] = lang_descr["value"]

            if parse_aliases:
                lang_aliases = (obj.get("aliases") or {}).get(lang)
                if lang_aliases and to_print:
                    for item in lang_aliases:
                        print("alias (" + lang + "):", item["value"])

            if to_print:
                print()

        print(datetime.datetime.now(), "processed", cnt, "lines of WikiData JSON dump")

    return title_to_id, id_to_descr
|