kb snippet, draft by Matt (wip)

2019-03-15 11:17:35 +01:00 · 2019-03-15 11:17:35 +01:00 · 7f37737878
parent 735fc2a735
commit 7f37737878
1 changed files with 93 additions and 0 deletions
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -0,0 +1,93 @@
+"""Knowledge-base for entity or concept linking."""
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMap
+from libcpp.vector cimport vector
+from libc.stdint cimport int32_t, int64_t
+from spacy.typedefs cimport attr_t
+
+
+# Internal struct, for storage and disambiguation. This isn't what we return
+# to the user as the answer to "here's your entity". It's the minimum number
+# of bits we need to keep track of the answers (128 with 64-bit pointers).
+cdef struct _EntryC:
+
+    # Allows retrieval of one or more vectors.
+    # Each element of vector_rows should be an index into a vectors table.
+    # Every entry should have the same number of vectors, so we can avoid
+    # storing the number of vectors in each knowledge-base struct.
+    const int32_t* vector_rows
+
+    # Allows retrieval of a struct of non-vector features. We could make this a
+    # pointer, but we have 32 bits left over in the struct after prob, so we'd
+    # like this to only be 32 bits. We can also set this to -1, for the common
+    # case where there are no features.
+    int32_t feats_row
+    float prob # log probability of entity, based on corpus frequency
+
+
+cdef class KnowledgeBase:
+    cdef Pool mem
+
+    # This maps 64bit keys to 64bit values. Here the key would be a hash of
+    # a unique string name for the entity, and the value would be the position
+    # of the _EntryC struct in our vector.
+    # The PreshMap is pretty space efficient, as it uses open addressing. So
+    # the only overhead is the vacancy rate, which is approximately 30%.
+    cdef PreshMap _index
+
+    # Each entry takes 128 bits, and again we'll have a 30% or so overhead for
+    # over allocation.
+    # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries.
+    # Storing 1m entries would take 41.6mb under this scheme.
+    cdef vector[_EntryC] _entries
+
+    # This is the part which might take more space: storing various
+    # categorical features for the entries, and storing vectors for disambiguation
+    # and possibly usage.
+    # If each entry gets a 300-dimensional vector, for 1m entries we would need
+    # 1.2gb. That gets expensive fast. What might be better is to avoid learning
+    # a unique vector for every entity. We could instead have a compositional
+    # model, that embeds different features of the entities into vectors. We'll
+    # still want some per-entity features, like the Wikipedia text or entity
+    # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions.
+    cdef object _vectors_table
+
+    # It's very useful to track categorical features, at least for output, even
+    # if they're not useful in the model itself. For instance, we should be
+    # able to track stuff like a person's date of birth or whatever. This can
+    # easily make the KB bigger, but if this isn't needed by the model, and it's
+    # optional data, we can let users configure a DB as the backend for this.
+    cdef object _features_table
+
+    # This should map mention hashes to (entry_id, prob) tuples. The probability
+    # should be P(entity | mention), which is pretty important to know.
+    # We can pack both pieces of information into a 64-bit value, to keep things
+    # efficient.
+    cdef object _aliases_table
+
+    def __len__(self):
+        return self._entries.size()
+
+    def add(self, name, float prob, vectors=None, features=None, aliases=None):
+        """Add an entity with log-prob `prob`; skip names already present."""
+        if name in self:
+            return
+        cdef attr_t orth = get_string_name(name)
+        # Index from Python: PreshMap item assignment needs the GIL.
+        self._index[orth] = self.c_add(
+            orth, prob, self._vectors_table.get_pointer(vectors),
+            self._features_table.get(features))
+        for alias in aliases or ():  # aliases defaults to None
+            self._aliases_table.add(alias, orth)
+
+    cdef int64_t c_add(self, attr_t orth, float prob,
+                       const int32_t* vector_rows, int feats_row) nogil:
+        """Append an entry; return its position in self._entries."""
+        cdef int64_t index = self._entries.size()
+        self._entries.push_back(
+            _EntryC(
+                vector_rows=vector_rows,
+                feats_row=feats_row,
+                prob=prob
+            ))
+        return index  # NOTE(review): 0 for first entry; confirm map stores 0