mirror of https://github.com/explosion/spaCy.git
* Removed the happax table (words seen exactly once); new lexemes are now added directly to vocab. Not sure if this is a good idea.
This commit is contained in:
parent
edd38a84b1
commit
18fb76b2c4
1
setup.py
1
setup.py
|
@ -48,7 +48,6 @@ exts = [
|
||||||
Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes),
|
|
||||||
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
|
||||||
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
|
Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++",
|
||||||
include_dirs=includes),
|
include_dirs=includes),
|
||||||
|
|
|
@ -2,7 +2,6 @@ from libcpp.vector cimport vector
|
||||||
from libc.stdint cimport uint64_t
|
from libc.stdint cimport uint64_t
|
||||||
|
|
||||||
from sparsehash.dense_hash_map cimport dense_hash_map
|
from sparsehash.dense_hash_map cimport dense_hash_map
|
||||||
from _hashing cimport FixedTable
|
|
||||||
|
|
||||||
# Circular import problems here
|
# Circular import problems here
|
||||||
ctypedef size_t Lexeme_addr
|
ctypedef size_t Lexeme_addr
|
||||||
|
@ -25,7 +24,6 @@ from spacy.lexeme cimport Orthography
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
cdef object name
|
cdef object name
|
||||||
cdef FixedTable happax
|
|
||||||
cdef Vocab* vocab
|
cdef Vocab* vocab
|
||||||
cdef Vocab* distri
|
cdef Vocab* distri
|
||||||
cdef Vocab* ortho
|
cdef Vocab* ortho
|
||||||
|
@ -41,7 +39,3 @@ cdef class Language:
|
||||||
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
|
cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
|
||||||
int split, size_t length)
|
int split, size_t length)
|
||||||
cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
|
cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
|
||||||
|
|
||||||
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -55,14 +55,10 @@ def set_orth_flags(lex, length):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
DEF MAX_HAPPAX = 1048576
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
def __cinit__(self, name):
|
def __cinit__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.bacov = {}
|
self.bacov = {}
|
||||||
self.happax = FixedTable(MAX_HAPPAX)
|
|
||||||
self.vocab = new Vocab()
|
self.vocab = new Vocab()
|
||||||
self.ortho = new Vocab()
|
self.ortho = new Vocab()
|
||||||
self.distri = new Vocab()
|
self.distri = new Vocab()
|
||||||
|
@ -85,7 +81,6 @@ cdef class Language:
|
||||||
length = len(token_string)
|
length = len(token_string)
|
||||||
hashed = self.hash_string(token_string, length)
|
hashed = self.hash_string(token_string, length)
|
||||||
word.tail = self._add(hashed, lex, 0, len(lex))
|
word.tail = self._add(hashed, lex, 0, len(lex))
|
||||||
self._happax_to_vocab(hashed, <Lexeme_addr>word.tail)
|
|
||||||
word = word.tail
|
word = word.tail
|
||||||
|
|
||||||
def load_clusters(self):
|
def load_clusters(self):
|
||||||
|
@ -127,27 +122,14 @@ cdef class Language:
|
||||||
# First, check words seen 2+ times
|
# First, check words seen 2+ times
|
||||||
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
|
cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
|
||||||
if word_ptr == NULL:
|
if word_ptr == NULL:
|
||||||
# Now check words seen exactly once
|
start = self.find_split(string, length) if start == -1 else start
|
||||||
word_ptr = <Lexeme*>self.happax.get(hashed)
|
word_ptr = self._add(hashed, string, start, length)
|
||||||
if word_ptr == NULL:
|
|
||||||
start = self.find_split(string, length) if start == -1 else start
|
|
||||||
word_ptr = self._add(hashed, string, start, length)
|
|
||||||
else:
|
|
||||||
# Second time word seen, move to vocab
|
|
||||||
self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr)
|
|
||||||
return <Lexeme_addr>word_ptr
|
return <Lexeme_addr>word_ptr
|
||||||
|
|
||||||
cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr):
|
|
||||||
self.vocab[0][hashed] = word_ptr
|
|
||||||
self.happax.erase(hashed)
|
|
||||||
|
|
||||||
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
|
cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
word = self.init_lexeme(string, hashed, split, length)
|
word = self.init_lexeme(string, hashed, split, length)
|
||||||
cdef Lexeme* clobbered = <Lexeme*>self.happax.insert(hashed, <size_t>word)
|
self.vocab[0][hashed] = <Lexeme_addr>word
|
||||||
if clobbered != NULL:
|
|
||||||
#free(clobbered)
|
|
||||||
pass
|
|
||||||
self.bacov[hashed] = string
|
self.bacov[hashed] = string
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
@ -212,7 +194,6 @@ cdef class Language:
|
||||||
# Now recurse, and deal with the tail
|
# Now recurse, and deal with the tail
|
||||||
if tail_string:
|
if tail_string:
|
||||||
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
|
word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
|
||||||
self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail)
|
|
||||||
return word
|
return word
|
||||||
|
|
||||||
cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
|
cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
|
||||||
|
|
Loading…
Reference in New Issue