From 18fb76b2c4965e5f3b2ff7e96e8e0c65587d1702 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <matthew@honnibal.com>
Date: Sat, 2 Aug 2014 20:53:35 +0100
Subject: [PATCH] * Removed the happax table. Not sure if this is a good idea.

---
 setup.py        |  1 -
 spacy/spacy.pxd |  6 ------
 spacy/spacy.pyx | 25 +++----------------------
 3 files changed, 3 insertions(+), 29 deletions(-)

diff --git a/setup.py b/setup.py
index 50a8dd271..eadfade84 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,6 @@ exts = [
     Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
               include_dirs=includes),
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 6501a8a2b..fdb43df74 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -2,7 +2,6 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
-from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -25,7 +24,6 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
-    cdef FixedTable happax
     cdef Vocab* vocab
     cdef Vocab* distri
     cdef Vocab* ortho
@@ -41,7 +39,3 @@ cdef class Language:
     cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                              int split, size_t length)
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
-
-    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)
-
- 
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index d3157ded7..d896b922b 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -55,14 +55,10 @@ def set_orth_flags(lex, length):
     return 0
 
 
-DEF MAX_HAPPAX = 1048576
-
-
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.happax = FixedTable(MAX_HAPPAX)
         self.vocab = new Vocab()
         self.ortho = new Vocab()
         self.distri = new Vocab()
@@ -85,7 +81,6 @@ cdef class Language:
                 length = len(token_string)
                 hashed = self.hash_string(token_string, length)
                 word.tail = self._add(hashed, lex, 0, len(lex))
-                self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
                 word = word.tail
 
     def load_clusters(self):
@@ -127,27 +122,14 @@ cdef class Language:
         # First, check words seen 2+ times
         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
         if word_ptr == NULL:
-            # Now check words seen exactly once
-            word_ptr = <Lexeme*>self.happax.get(hashed)
-            if word_ptr == NULL:
-                start = self.find_split(string, length) if start == -1 else start
-                word_ptr = self._add(hashed, string, start, length)
-            else:
-                # Second time word seen, move to vocab
-                self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
+            start = self.find_split(string, length) if start == -1 else start
+            word_ptr = self._add(hashed, string, start, length)
         return <Lexeme_addr>word_ptr
 
-    cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
-        self.vocab[0][hashed] = word_ptr
-        self.happax.erase(hashed)
-
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
         word = self.init_lexeme(string, hashed, split, length)
-        cdef Lexeme* clobbered = <Lexeme*>self.happax.insert(hashed, <size_t>word)
-        if clobbered != NULL:
-            #free(clobbered)
-            pass
+        self.vocab[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
         return word   
 
@@ -212,7 +194,6 @@ cdef class Language:
         # Now recurse, and deal with the tail
         if tail_string:
             word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
-            self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
         return word
 
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex):