* Remove happax stuff

This commit is contained in:
Matthew Honnibal 2014-08-02 22:11:28 +01:00
parent 6050209f65
commit f11c8e22eb
2 changed files with 3 additions and 25 deletions

View File

@ -26,7 +26,6 @@ from spacy.lexeme cimport Orthography
cdef class Language: cdef class Language:
cdef object name cdef object name
cdef FixedTable happax
cdef Vocab* vocab cdef Vocab* vocab
cdef Vocab* distri cdef Vocab* distri
cdef Vocab* ortho cdef Vocab* ortho
@ -42,5 +41,3 @@ cdef class Language:
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed, cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
int split, size_t length) int split, size_t length)
cdef Orthography* init_orth(self, StringHash hashed, unicode lex) cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)

View File

@ -54,14 +54,10 @@ def set_orth_flags(lex, length):
return 0 return 0
DEF MAX_HAPPAX = 1048576
cdef class Language: cdef class Language:
def __cinit__(self, name): def __cinit__(self, name):
self.name = name self.name = name
self.bacov = {} self.bacov = {}
self.happax = FixedTable(MAX_HAPPAX)
self.vocab = new Vocab() self.vocab = new Vocab()
self.ortho = new Vocab() self.ortho = new Vocab()
self.distri = new Vocab() self.distri = new Vocab()
@ -84,7 +80,6 @@ cdef class Language:
length = len(token_string) length = len(token_string)
hashed = self.hash_string(token_string, length) hashed = self.hash_string(token_string, length)
word.tail = self._add(hashed, lex, 0, len(lex)) word.tail = self._add(hashed, lex, 0, len(lex))
self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
word = word.tail word = word.tail
def load_clusters(self): def load_clusters(self):
@ -125,28 +120,15 @@ cdef class Language:
cdef StringHash hashed = self.hash_string(string, length) cdef StringHash hashed = self.hash_string(string, length)
# First, check words seen 2+ times # First, check words seen 2+ times
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed] cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
if word_ptr == NULL:
# Now check words seen exactly once
word_ptr = <Lexeme*>self.happax.get(hashed)
if word_ptr == NULL: if word_ptr == NULL:
start = self.find_split(string, length) if start == -1 else start start = self.find_split(string, length) if start == -1 else start
word_ptr = self._add(hashed, string, start, length) word_ptr = self._add(hashed, string, start, length)
else:
# Second time word seen, move to vocab
self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
return <Lexeme_addr>word_ptr return <Lexeme_addr>word_ptr
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
self.vocab[0][hashed] = word_ptr
self.happax.erase(hashed)
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
cdef size_t i cdef size_t i
word = self.init_lexeme(string, hashed, split, length) word = self.init_lexeme(string, hashed, split, length)
if self.happax.keys[hashed % self.happax.size] != 0: self.vocab[0][hashed] = <size_t>word
self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
self.happax.values[hashed % self.happax.size])
self.happax.insert(hashed, <size_t>word)
self.bacov[hashed] = string.encode('utf8') self.bacov[hashed] = string.encode('utf8')
return word return word
@ -211,7 +193,6 @@ cdef class Language:
# Now recurse, and deal with the tail # Now recurse, and deal with the tail
if tail_string: if tail_string:
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string)) word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
return word return word
cdef Orthography* init_orth(self, StringHash hashed, unicode lex): cdef Orthography* init_orth(self, StringHash hashed, unicode lex):