From 18fb76b2c4965e5f3b2ff7e96e8e0c65587d1702 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 2 Aug 2014 20:53:35 +0100 Subject: [PATCH] * Removed happax. Not sure if good idea. --- setup.py | 1 - spacy/spacy.pxd | 6 ------ spacy/spacy.pyx | 25 +++---------------------- 3 files changed, 3 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index 50a8dd271..eadfade84 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,6 @@ exts = [ Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes), Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes), - Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes), Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", include_dirs=includes), diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd index 6501a8a2b..fdb43df74 100644 --- a/spacy/spacy.pxd +++ b/spacy/spacy.pxd @@ -2,7 +2,6 @@ from libcpp.vector cimport vector from libc.stdint cimport uint64_t from sparsehash.dense_hash_map cimport dense_hash_map -from _hashing cimport FixedTable # Circular import problems here ctypedef size_t Lexeme_addr @@ -25,7 +24,6 @@ from spacy.lexeme cimport Orthography cdef class Language: cdef object name - cdef FixedTable happax cdef Vocab* vocab cdef Vocab* distri cdef Vocab* ortho @@ -41,7 +39,3 @@ cdef class Language: cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed, int split, size_t length) cdef Orthography* init_orth(self, StringHash hashed, unicode lex) - - cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr) - - diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index d3157ded7..d896b922b 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -55,14 +55,10 @@ def set_orth_flags(lex, length): return 0 -DEF MAX_HAPPAX = 1048576 - - cdef class Language: def __cinit__(self, name): self.name = name self.bacov = {} - self.happax = FixedTable(MAX_HAPPAX) self.vocab = new Vocab() self.ortho = new Vocab() self.distri = new Vocab() @@ -85,7 +81,6 @@ cdef class Language: length = len(token_string) hashed = self.hash_string(token_string, length) word.tail = self._add(hashed, lex, 0, len(lex)) - self._happax_to_vocab(hashed, word.tail) word = word.tail def load_clusters(self): @@ -127,27 +122,14 @@ cdef class Language: # First, check words seen 2+ times cdef Lexeme* word_ptr = self.vocab[0][hashed] if word_ptr == NULL: - # Now check words seen exactly once - word_ptr = self.happax.get(hashed) - if word_ptr == NULL: - start = self.find_split(string, length) if start == -1 else start - word_ptr = self._add(hashed, string, start, length) - else: - # Second time word seen, move to vocab - self._happax_to_vocab(hashed, word_ptr) + start = self.find_split(string, length) if start == -1 else start + word_ptr = self._add(hashed, string, start, length) return word_ptr - cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr): - self.vocab[0][hashed] = word_ptr - self.happax.erase(hashed) - cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): cdef size_t i word = self.init_lexeme(string, hashed, split, length) - cdef Lexeme* clobbered = self.happax.insert(hashed, word) - if clobbered != NULL: - #free(clobbered) - pass + self.vocab[0][hashed] = word self.bacov[hashed] = string return word @@ -212,7 +194,6 @@ cdef class Language: # Now recurse, and deal with the tail if tail_string: word.tail = self.lookup(-1, tail_string, len(tail_string)) - self._happax_to_vocab(word.tail.sic, word.tail) return word cdef Orthography* init_orth(self, StringHash hashed, unicode lex):