From cf341132504a0edd157f65390a1a33400d9e8337 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 18 Mar 2019 17:27:51 +0100
Subject: [PATCH] very minimal KB functionality working

Add "spacy.kb" to MOD_NAMES in setup.py, give KnowledgeBase an __init__
that creates the entry/alias PreshMaps and the memory Pool, drop `const`
from the _AliasC vectors, and move the entity-hash lookup for aliases from
KnowledgeBase.add_alias (kb.pyx) into c_add_aliases (kb.pxd). The sandbox
test script is moved under spacy/ and gains a create_kb() smoke test.

Review fix: c_add_aliases zips `entities` with the `probabilities`
argument. The loop previously zipped with `probs`, the local vector that
had just been declared and was still empty, so it had nothing to iterate
over and every alias was stored without candidates or prior probabilities.

---
 setup.py                                      |  1 +
 spacy/kb.pxd                                  | 17 ++++++++---
 spacy/kb.pyx                                  | 30 ++++++++++++-------
 .../sandbox_test_sofie}/__init__.py           |  0
 .../sandbox_test_sofie}/testing_el.py         | 15 +++++++++-
 5 files changed, 47 insertions(+), 16 deletions(-)
 rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/__init__.py (100%)
 rename {sandbox_test_sofie => spacy/sandbox_test_sofie}/testing_el.py (67%)

diff --git a/setup.py b/setup.py
index 6f29e1efa..d579fd20e 100755
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@ MOD_NAMES = [
     "spacy.lexeme",
     "spacy.vocab",
     "spacy.attrs",
+    "spacy.kb",
     "spacy.morphology",
     "spacy.pipeline.pipes",
     "spacy.syntax.stateclass",
diff --git a/spacy/kb.pxd b/spacy/kb.pxd
index 92a0c8b95..43f3e83e8 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -4,6 +4,7 @@ from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from .typedefs cimport hash_t
+from .strings cimport hash_string
 
 
 # Internal struct, for storage and disambiguation. This isn't what we return
@@ -32,10 +33,10 @@ cdef struct _EntryC:
 cdef struct _AliasC:
 
     # All entry candidates for this alias
-    const vector[int64_t] entry_indices
+    vector[int64_t] entry_indices
 
     # Prior probability P(entity|alias) - should sum up to (at most) 1.
-    const vector[float] probs
+    vector[float] probs
 
 
 cdef class KnowledgeBase:
@@ -94,13 +95,21 @@ cdef class KnowledgeBase:
                 feats_row=feats_row,
                 prob=prob
             ))
-        self._index[entity_key] = entity_index
+        self._entry_index[entity_key] = entity_index
         return entity_index
 
-    cdef inline int64_t c_add_aliases(self, hash_t alias_key, vector[int64_t] entry_indices, vector[float] probs):
+    cdef inline int64_t c_add_aliases(self, hash_t alias_key, entities, probabilities):
         """Connect a mention to a list of potential entities with their prior probabilities ."""
         cdef int64_t alias_index = self._aliases_table.size()
 
+        cdef vector[int64_t] entry_indices
+        cdef vector[float] probs
+
+        for entity, prob in zip(entities, probabilities):
+            entry_index = self._entry_index[hash_string(entity)]
+            entry_indices.push_back(entry_index)
+            probs.push_back(prob)
+
         self._aliases_table.push_back(
             _AliasC(
                 entry_indices=entry_indices,
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 0f6a7aecc..d2b8fffe1 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -1,34 +1,42 @@
-from .strings cimport hash_string
+# cython: profile=True
+# coding: utf8
+from preshed.maps import PreshMap
 
 
 cdef class KnowledgeBase:
+
+    def __init__(self):
+        self._entry_index = PreshMap()
+        self._alias_index = PreshMap()
+        self.mem = Pool()
+
+
     def __len__(self):
         return self._entries.size()
 
-    def add_entity(self, entity_id: str, float prob, vectors=None, features=None):
+    def add_entity(self, unicode entity_id, float prob, vectors=None, features=None):
+        cdef hash_t id_hash = hash_string(entity_id)
+
         # TODO: more friendly check for non-unique name
-        if entity_id in self:
+        if id_hash in self._entry_index:
             return
 
-        cdef hash_t id_hash = hash_string(entity_id)
+
         cdef int32_t dummy_value = 342
         self.c_add_entity(entity_key=id_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
         # TODO self._vectors_table.get_pointer(vectors),
         # self._features_table.get(features))
 
-    def add_alias(self, alias, entities, probabilities):
+    def add_alias(self, unicode alias, entities, probabilities):
         """For a given alias, add its potential entities and prior probabilies to the KB."""
         cdef hash_t alias_hash = hash_string(alias)
-        cdef hash_t entity_hash = 0
-        cdef int64_t entity_index = 0
-
-        cdef vector[int64_t] entry_indices = [self._entry_index[hash_string(entity)] for entity in entities]
-
-        self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probabilities)
 
         # TODO: check that alias hadn't been defined before
         # TODO: check that entity is already in this KB (entity_index is OK)
         # TODO: check sum(probabilities) <= 1
         # TODO: check len(entities) == len(probabilities)
 
+        self.c_add_aliases(alias_key=alias_hash, entities=entities, probabilities=probabilities)
+
+
 
diff --git a/sandbox_test_sofie/__init__.py b/spacy/sandbox_test_sofie/__init__.py
similarity index 100%
rename from sandbox_test_sofie/__init__.py
rename to spacy/sandbox_test_sofie/__init__.py
diff --git a/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py
similarity index 67%
rename from sandbox_test_sofie/testing_el.py
rename to spacy/sandbox_test_sofie/testing_el.py
index 7883e44d4..840d890b5 100644
--- a/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@@ -1,4 +1,16 @@
 import spacy
+from spacy.kb import KnowledgeBase
+
+
+def create_kb():
+    mykb = KnowledgeBase()
+    print("kb size", len(mykb))
+
+    entity_id = "Q42"
+    mykb.add_entity(entity_id=entity_id, prob=0.5)
+    print("adding entity", entity_id)
+
+    print("kb size", len(mykb))
 
 
 def add_el():
@@ -23,4 +35,5 @@ def add_el():
 
 
 if __name__ == "__main__":
-    add_el()
+    # add_el()
+    create_kb()