* Rename vec to repvec and load_vectors to load_rep_vectors; index vectors by norm1 instead of sic; remove debug asserts

This commit is contained in:
Matthew Honnibal 2015-01-22 02:06:22 +11:00
parent 8b9d913d97
commit d460c28838
1 changed file with 7 additions and 15 deletions

View File

@ -22,7 +22,7 @@ DEF MAX_VEC_SIZE = 100000
cdef float[MAX_VEC_SIZE] EMPTY_VEC cdef float[MAX_VEC_SIZE] EMPTY_VEC
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC)) memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vec = EMPTY_VEC EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
@ -38,15 +38,12 @@ cdef class Vocab:
if data_dir is not None: if data_dir is not None:
if not path.exists(data_dir): if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
assert EMPTY_LEXEME.vec != NULL
if data_dir is not None: if data_dir is not None:
if not path.isdir(data_dir): if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.strings.load(path.join(data_dir, 'strings.txt')) self.strings.load(path.join(data_dir, 'strings.txt'))
self.load_lexemes(path.join(data_dir, 'lexemes.bin')) self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
self.load_vectors(path.join(data_dir, 'vec.bin')) self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
for i in range(self.lexemes.size()):
assert self.lexemes[i].vec != NULL, repr(self.strings[self.lexemes[i].sic])
def __len__(self): def __len__(self):
"""The current number of lexemes stored.""" """The current number of lexemes stored."""
@ -59,7 +56,6 @@ cdef class Vocab:
cdef LexemeC* lex cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key) lex = <LexemeC*>self._map.get(c_str.key)
if lex != NULL: if lex != NULL:
assert lex.vec != NULL
return lex return lex
if c_str.n < 3: if c_str.n < 3:
mem = self.mem mem = self.mem
@ -67,7 +63,6 @@ cdef class Vocab:
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(py_str) props = self.lexeme_props_getter(py_str)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
assert lex.vec != NULL
if mem is self.mem: if mem is self.mem:
lex.id = self.lexemes.size() lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex) self._add_lex_to_vocab(c_str.key, lex)
@ -119,8 +114,6 @@ cdef class Vocab:
lex.id = self.lexemes.size() lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex) self._add_lex_to_vocab(c_str.key, lex)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
assert lex.vec != NULL
assert lex.sic < 1000000
def dump(self, loc): def dump(self, loc):
if path.exists(loc): if path.exists(loc):
@ -159,7 +152,7 @@ cdef class Vocab:
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# Copies data from the file into the lexeme # Copies data from the file into the lexeme
st = fread(lexeme, sizeof(LexemeC), 1, fp) st = fread(lexeme, sizeof(LexemeC), 1, fp)
lexeme.vec = EMPTY_VEC lexeme.repvec = EMPTY_VEC
if st != 1: if st != 1:
break break
self._map.set(key, lexeme) self._map.set(key, lexeme)
@ -169,7 +162,7 @@ cdef class Vocab:
i += 1 i += 1
fclose(fp) fclose(fp)
def load_vectors(self, loc): def load_rep_vectors(self, loc):
file_ = _CFile(loc, 'rb') file_ = _CFile(loc, 'rb')
cdef int32_t word_len cdef int32_t word_len
cdef int32_t vec_len cdef int32_t vec_len
@ -202,11 +195,10 @@ cdef class Vocab:
for i in range(self.lexemes.size()): for i in range(self.lexemes.size()):
# Cast away the const, cos we can modify our lexemes # Cast away the const, cos we can modify our lexemes
lex = <LexemeC*>self.lexemes[i] lex = <LexemeC*>self.lexemes[i]
if lex.sic < vectors.size(): if lex.norm1 < vectors.size():
lex.vec = vectors[lex.sic] lex.repvec = vectors[lex.norm1]
else: else:
lex.vec = EMPTY_VEC lex.repvec = EMPTY_VEC
assert lex.vec != NULL
def write_binary_vectors(in_loc, out_loc): def write_binary_vectors(in_loc, out_loc):