diff --git a/spacy/kb.pxd b/spacy/kb.pxd
new file mode 100644
index 000000000..939030098
--- /dev/null
+++ b/spacy/kb.pxd
@@ -0,0 +1,93 @@
+"""Knowledge-base for entity or concept linking."""
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t, int64_t
+from spacy.typedefs cimport attr_t
+
+
+# Internal struct, for storage and disambiguation. This isn't what we return
+# to the user as the answer to "here's your entity". It's the minimum number
+# of bits we need to keep track of the answers.
+cdef struct _EntryC:
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid storing
+    # the number of vectors in each knowledge-base struct.
+    const int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+    float prob  # log probability of entity, based on corpus frequency
+
+
+cdef class KnowledgeBase:
+    cdef Pool mem
+
+    # This maps 64-bit keys to 64-bit values. Here the key would be a hash of
+    # a unique string name for the entity, and the value would be the position
+    # of the _EntryC struct in our vector.
+    # The PreshMap is pretty space-efficient, as it uses open addressing. So
+    # the only overhead is the vacancy rate, which is approximately 30%.
+    cdef PreshMap _index
+
+    # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
+    # over-allocation.
+    # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
+    # Storing 1m entries would take 41.6 MB under this scheme.
+    cdef vector[_EntryC] _entries
+
+    # This is the part which might take more space: storing various
+    # categorical features for the entries, and storing vectors for disambiguation
+    # and possibly usage.
+    # If each entry gets a 300-dimensional vector, for 1m entries we would need
+    # 1.2 GB. That gets expensive fast. What might be better is to avoid learning
+    # a unique vector for every entity. We could instead have a compositional
+    # model that embeds different features of the entities into vectors. We'll
+    # still want some per-entity features, like the Wikipedia text or entity
+    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
+    cdef object _vectors_table
+
+    # It's very useful to track categorical features, at least for output, even
+    # if they're not useful in the model itself. For instance, we should be
+    # able to track stuff like a person's date of birth or whatever. This can
+    # easily make the KB bigger, but if this isn't needed by the model, and it's
+    # optional data, we can let users configure a DB as the backend for this.
+    cdef object _features_table
+
+    # This should map mention hashes to (entry_id, prob) tuples. The probability
+    # should be P(entity | mention), which is pretty important to know.
+    # We can pack both pieces of information into a 64-bit value, to keep things
+    # efficient.
+    cdef object _aliases_table
+
+    def __len__(self):
+        return self._entries.size()
+
+    def add(self, name, float prob, vectors=None, features=None, aliases=None):
+        if name in self:
+            return
+        cdef attr_t orth = get_string_name(name)
+        self.c_add(orth, prob, self._vectors_table.get_pointer(vectors),
+                   self._features_table.get(features))
+        for alias in aliases or []:
+            self._aliases_table.add(alias, orth)
+
+    cdef int64_t c_add(self, attr_t orth, float prob, const int32_t* vector_rows,
+                       int feats_row):
+        """Add an entry to the knowledge base, returning its index."""
+        # This is what we'll map the orth to. It's where the entry will sit
+        # in the vector of entries, so we can get it later.
+        cdef int64_t index = self._entries.size()
+        self._entries.push_back(
+            _EntryC(
+                vector_rows=vector_rows,
+                feats_row=feats_row,
+                prob=prob
+            ))
+        self._index[orth] = index
+        return index
\ No newline at end of file
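A minimal pure-Python sketch of the packing idea described in the _aliases_table comment, assuming a 32-bit entry index and a float32 probability; the helper names pack_alias_entry and unpack_alias_entry are hypothetical and not part of the diff. The entry index goes in the high 32 bits and the IEEE-754 bit pattern of P(entity | mention) in the low 32 bits.

import struct

def pack_alias_entry(entry_id, prob):
    # Reinterpret the float32 probability as an unsigned 32-bit pattern,
    # then put the entry index in the high half of the 64-bit value.
    prob_bits = struct.unpack("<I", struct.pack("<f", prob))[0]
    return (entry_id << 32) | prob_bits

def unpack_alias_entry(packed):
    # Recover (entry_id, prob); prob comes back at float32 precision.
    entry_id = packed >> 32
    prob = struct.unpack("<f", struct.pack("<I", packed & 0xFFFFFFFF))[0]
    return entry_id, prob

# Round-trip check: entry 12345 with P(entity | mention) = 0.25.
assert unpack_alias_entry(pack_alias_entry(12345, 0.25)) == (12345, 0.25)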
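The sizing claims in the comments can be reproduced with the same back-of-the-envelope arithmetic; every number below is taken from the comments themselves (128-bit entries, roughly 30% open-addressing overhead, 300-dimensional float32 vectors).

n_entries = 1_000_000

# Index (PreshMap) plus entries vector: (N*128*1.3) + (N*128*1.3) bits.
compact_bits = 2 * (n_entries * 128 * 1.3)
print(compact_bits / 8 / 1e6)   # ~41.6 MB, as stated for the compact layout

# A dense 300-dimensional float32 vector per entity, by contrast:
vector_bytes = n_entries * 300 * 4
print(vector_bytes / 1e9)       # ~1.2 GB, which is why a compositional model is suggested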