diff --git a/spacy/kb.pxd b/spacy/kb.pxd index f4f60d478..d0f31ebb4 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -112,4 +112,23 @@ cdef class KnowledgeBase: self._alias_index[alias_key] = alias_index return alias_index + cdef inline create_empty_vectors(self): + """ + Making sure the first element of each vector is a dummy, + because the PreshMap maps pointing to indices in these vectors can not contain 0 as value + cf. https://github.com/explosion/preshed/issues/17 + """ + cdef int32_t dummy_value = 0 + self._entries.push_back( + _EntryC( + vector_rows=&dummy_value, + feats_row=dummy_value, + prob=dummy_value + )) + self._aliases_table.push_back( + _AliasC( + entry_indices=[dummy_value], + probs=[dummy_value] + )) + diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ea23e5373..f67519260 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,5 +1,6 @@ # cython: profile=True # coding: utf8 +from spacy.errors import user_warning cdef class KnowledgeBase: @@ -8,6 +9,7 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.mem = Pool() self.strings = StringStore() + self.create_empty_vectors() def __len__(self): return self.get_size_entities() @@ -21,8 +23,9 @@ cdef class KnowledgeBase: def add_entity(self, unicode entity_id, float prob, vectors=None, features=None): cdef hash_t id_hash = self.strings.add(entity_id) - # TODO: more friendly check for non-unique name + # Return if this entity was added before if id_hash in self._entry_index: + user_warning("Entity " + entity_id + " already exists in the KB") return cdef int32_t dummy_value = 342 @@ -33,6 +36,12 @@ cdef class KnowledgeBase: def add_alias(self, unicode alias, entities, probabilities): """For a given alias, add its potential entities and prior probabilies to the KB.""" cdef hash_t alias_hash = self.strings.add(alias) + + # Return if this alias was added before + if alias_hash in self._alias_index: + user_warning("Alias " + alias + " already exists in the KB") + return + cdef hash_t entity_hash cdef vector[int64_t] entry_indices @@ -47,12 +56,12 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - # TODO: check that alias hadn't been defined before # TODO: check sum(probabilities) <= 1 # TODO: check len(entities) == len(probabilities) self.c_add_aliases(alias_key=alias_hash, entry_indices=entry_indices, probs=probs) + def get_candidates(self, unicode alias): cdef hash_t alias_hash = self.strings.add(alias) alias_index = self._alias_index.get(alias_hash) diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index b5b529d4b..734eddd8d 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -1,23 +1,28 @@ +# coding: utf-8 import spacy from spacy.kb import KnowledgeBase def create_kb(): mykb = KnowledgeBase() + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) # adding entities entity_0 = "Q0" # douglas adams - mykb.add_entity(entity_id=entity_0, prob=0.5) print(" adding entity", entity_0) + mykb.add_entity(entity_id=entity_0, prob=0.5) entity_42 = "Q42" # douglas adams - mykb.add_entity(entity_id=entity_42, prob=0.5) print(" adding entity", entity_42) + mykb.add_entity(entity_id=entity_42, prob=0.5) entity_5301561 = "Q5301561" - mykb.add_entity(entity_id=entity_5301561, prob=0.5) print(" adding entity", entity_5301561) + mykb.add_entity(entity_id=entity_5301561, prob=0.5) + + print(" adding entity", entity_5301561) + mykb.add_entity(entity_id=entity_5301561, prob=0.5) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) @@ -32,6 +37,15 @@ def create_kb(): candidates = mykb.get_candidates(alias) print(" ", candidates) + print(" adding alias", alias) + mykb.add_alias(alias=alias, entities=["Q42"], probabilities=[0.9]) + + print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) + + print("candidates for", alias) + candidates = mykb.get_candidates(alias) + print(" ", candidates) + def add_el(): nlp = spacy.load('en_core_web_sm')