* Use a sparse_hash_map to store hapax vocab items (words seen only once), with a max size.

2014-07-31 17:40:43 +01:00 · 2014-07-31 17:40:43 +01:00 · 5b81ee716f
parent a235804730
commit 5b81ee716f
2 changed files with 24 additions and 3 deletions
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@ -2,12 +2,14 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t

 from sparsehash.dense_hash_map cimport dense_hash_map
+from sparsehash.sparse_hash_map cimport sparse_hash_map


 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
+ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 from spacy.lexeme cimport Lexeme

 from spacy.tokens cimport Tokens
@ -25,6 +27,7 @@ from spacy.lexeme cimport Orthography

 cdef class Language:
    cdef object name
+    cdef SparseVocab* happax
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -53,13 +53,18 @@ def set_orth_flags(lex, length):
    return 0


+DEF MAX_HAPPAX = 1000000
+
+
 cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
+        self.happax = new SparseVocab()
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
+        self.happax[0].set_deleted_key(0)
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
@ -114,15 +119,28 @@ cdef class Language:
        if length == 0:
            return <Lexeme_addr>&BLANK_WORD
        cdef StringHash hashed = self.hash_string(string, length)
+        # First, check words seen 2+ times
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
        if word_ptr == NULL:
-            start = self.find_split(string, length) if start == -1 else start
-            word_ptr = self._add(hashed, string, start, length)
+            # Now check words seen exactly once
+            word_ptr = <Lexeme*>self.happax[0][hashed]
+            if word_ptr == NULL:
+                start = self.find_split(string, length) if start == -1 else start
+                word_ptr = self._add(hashed, string, start, length)
+            else:
+                # Second time word seen, move to vocab
+                self.vocab[0][hashed] = <Lexeme_addr>word_ptr
+                self.happax[0].erase(hashed)
        return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
+        # Build a Lexeme for a string that is in neither `vocab` nor `happax`,
+        # register it in `happax` (the table of words seen exactly once) and
+        # record the original string in `bacov` under the same hash.
+        # Returns the pointer produced by `init_lexeme`.
+        if self.happax[0].size() >= MAX_HAPPAX:
+            # Table is full: evict one entry to keep it bounded.
+            # sparse_hash_map keeps no insertion order, so begin() is simply
+            # whichever entry the table yields first.  Erasing end() is a
+            # no-op in sparsehash and would never shrink the map.
+            # Only the table slot is dropped; the Lexeme itself is not freed,
+            # since its address may already have been returned to a caller.
+            self.happax[0].erase(self.happax[0].begin())
        word = self.init_lexeme(string, hashed, split, length)
-        self.vocab[0][hashed] = <Lexeme_addr>word
+        self.happax[0][hashed] = <Lexeme_addr>word
        self.bacov[hashed] = string
        return word