From 5b81ee716fad3caebba9f801113249af874a49bd Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 31 Jul 2014 17:40:43 +0100
Subject: [PATCH] * Use a sparse_hash_map to store happax vocab items, with a max size.

---
Review notes (free text between "---" and the first "diff --git" line is
not part of the commit):

* Language.lookup checks `vocab` first, then `happax`. A hit in `happax`
  copies the entry into `vocab` and erases it from `happax`; a miss in both
  calls `_add`.
* Language._add now stores new entries in `happax` instead of `vocab`.
  When `happax` already holds MAX_HAPPAX (1000000) or more entries, one
  entry is evicted before the insert.
* The eviction uses erase(begin()). The earlier erase(end()) passed the
  past-the-end iterator, which does not refer to an element, so nothing was
  removed and the size cap was never enforced. A hash map is unordered, so
  the evicted entry is arbitrary.
* NOTE(review): assumes the sparsehash .pxd wrapper declares begin()
  alongside end() -- confirm.
* NOTE(review): eviction only drops the entry from `happax`; nothing in
  this patch releases what init_lexeme returned or removes the matching
  `bacov` entry -- confirm whether that is intended.
* NOTE(review): the `map[0][hashed]` reads use operator[], which presumably
  inserts a placeholder entry on a miss -- verify against the wrapper.
* NOTE(review): `i` and `it` in _add are declared but unused.
* NOTE(review): this patch was recovered from a whitespace-collapsed copy.
  Line breaks, indentation and the position of blank context lines were
  reconstructed from the hunk line counts, and angle-bracketed text (author
  e-mail, possibly Cython casts) appears to have been stripped. Verify
  against the repository before applying.

 spacy/spacy.pxd |  3 +++
 spacy/spacy.pyx | 24 +++++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 88c4c36a2..2c8b5e141 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -2,12 +2,14 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
+from sparsehash.sparse_hash_map cimport sparse_hash_map
 
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
+ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 from spacy.lexeme cimport Lexeme
 from spacy.tokens cimport Tokens
 
@@ -25,6 +27,7 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
+    cdef SparseVocab* happax
     cdef Vocab* vocab
     cdef Vocab* distri
     cdef Vocab* ortho
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 2696003c4..cadc4407c 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -53,13 +53,18 @@ def set_orth_flags(lex, length):
     return 0
 
 
+DEF MAX_HAPPAX = 1000000
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
+        self.happax = new SparseVocab()
         self.vocab = new Vocab()
         self.ortho = new Vocab()
         self.distri = new Vocab()
+        self.happax[0].set_deleted_key(0)
         self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
@@ -114,15 +119,28 @@ cdef class Language:
         if length == 0:
             return &BLANK_WORD
         cdef StringHash hashed = self.hash_string(string, length)
+        # First, check words seen 2+ times
         cdef Lexeme* word_ptr = self.vocab[0][hashed]
         if word_ptr == NULL:
-            start = self.find_split(string, length) if start == -1 else start
-            word_ptr = self._add(hashed, string, start, length)
+            # Now check words seen exactly once
+            word_ptr = self.happax[0][hashed]
+            if word_ptr == NULL:
+                start = self.find_split(string, length) if start == -1 else start
+                word_ptr = self._add(hashed, string, start, length)
+            else:
+                # Second time word seen, move to vocab
+                self.vocab[0][hashed] = word_ptr
+                self.happax[0].erase(hashed)
         return word_ptr
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
+        cdef size_t i
+        cdef sparse_hash_map[StringHash, size_t].iterator it
+        if self.happax[0].size() >= MAX_HAPPAX:
+            # At capacity: evict one arbitrary entry (the map is unordered).
+            self.happax[0].erase(self.happax[0].begin())
         word = self.init_lexeme(string, hashed, split, length)
-        self.vocab[0][hashed] = word
+        self.happax[0][hashed] = word
         self.bacov[hashed] = string
         return word
 