2019-06-19 11:11:39 +00:00
|
|
|
# coding: utf-8
|
2019-06-24 10:58:18 +00:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2019-09-28 16:05:00 +00:00
|
|
|
from ..util import make_tempdir
|
2019-09-29 15:32:12 +00:00
|
|
|
from ...util import ensure_path
|
|
|
|
|
|
|
|
from spacy.kb import KnowledgeBase
|
2019-09-28 16:05:00 +00:00
|
|
|
|
2019-04-24 21:52:34 +00:00
|
|
|
|
|
|
|
def test_serialize_kb_disk(en_vocab):
    """Round-trip a small KnowledgeBase through disk and re-validate it.

    The same set of checks is run on the freshly built KB and on a new
    KB instance populated from the dumped file.
    """
    # Build the reference KB and make sure it is sane before serializing.
    original_kb = _get_dummy_kb(en_vocab)
    _check_kb(original_kb)

    # Write the KB into a temporary directory and read it back.
    with make_tempdir() as tmp_dir:
        target_dir = ensure_path(tmp_dir)
        if not target_dir.exists():
            target_dir.mkdir()
        kb_location = str(target_dir / "kb")
        original_kb.dump(kb_location)

        # The receiving KB is created with the same vector length as the
        # one used when building the dummy KB.
        restored_kb = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        restored_kb.load_bulk(kb_location)

    # The reloaded KB must pass exactly the same checks as the original.
    _check_kb(restored_kb)
|
|
|
|
|
|
|
|
|
2019-04-29 15:37:29 +00:00
|
|
|
def _get_dummy_kb(vocab):
    """Create a small KnowledgeBase fixture with four entities and three aliases.

    vocab: the vocabulary handed to the KnowledgeBase constructor.
    RETURNS: the populated KnowledgeBase (entity vectors have length 3).
    """
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)

    # (entity id, frequency, entity vector)
    entity_specs = [
        ("Q53", 33, [0, 5, 3]),
        ("Q17", 2, [7, 1, 0]),
        ("Q007", 7, [0, 0, 7]),
        ("Q44", 342, [4, 4, 4]),
    ]
    for entity_id, frequency, vector in entity_specs:
        kb.add_entity(entity=entity_id, freq=frequency, entity_vector=vector)

    # (alias, candidate entity ids, prior probability per candidate)
    alias_specs = [
        ("double07", ["Q17", "Q007"], [0.1, 0.9]),
        ("guy", ["Q53", "Q007", "Q17", "Q44"], [0.3, 0.3, 0.2, 0.1]),
        ("random", ["Q007"], [1.0]),
    ]
    for alias, entity_ids, priors in alias_specs:
        kb.add_alias(alias=alias, entities=entity_ids, probabilities=priors)

    return kb
|
|
|
|
|
|
|
|
|
2019-04-24 21:52:34 +00:00
|
|
|
def _check_kb(kb):
    """Assert that `kb` contains the expected entities, aliases and candidates.

    Checks the entity and alias counts, membership of the known and unknown
    strings, and the two candidates returned for the alias "double07".
    """
    # --- entities ---
    expected_entities = ["Q53", "Q17", "Q007", "Q44"]
    unknown_entities = ["", "Q0"]
    assert kb.get_size_entities() == 4
    for present in expected_entities:
        assert present in kb.get_entity_strings()
    for absent in unknown_entities:
        assert absent not in kb.get_entity_strings()

    # --- aliases ---
    expected_aliases = ["double07", "guy", "random"]
    unknown_aliases = ["nothingness", "", "randomnoise"]
    assert kb.get_size_aliases() == 3
    for present in expected_aliases:
        assert present in kb.get_alias_strings()
    for absent in unknown_aliases:
        assert absent not in kb.get_alias_strings()

    # --- candidates & prior probabilities ---
    # Sort by entity id so the comparison does not depend on return order.
    candidates = sorted(
        kb.get_candidates("double07"), key=lambda cand: cand.entity_
    )
    assert len(candidates) == 2
    bond, other = candidates

    # "Q007" sorts before "Q17", so it comes first.
    assert bond.entity_ == "Q007"
    assert 6.999 < bond.entity_freq < 7.01
    assert bond.entity_vector == [0, 0, 7]
    assert bond.alias_ == "double07"
    assert 0.899 < bond.prior_prob < 0.901

    assert other.entity_ == "Q17"
    assert 1.99 < other.entity_freq < 2.01
    assert other.entity_vector == [7, 1, 0]
    assert other.alias_ == "double07"
    assert 0.099 < other.prior_prob < 0.101
|