* Revert prev change

This commit is contained in:
Matthew Honnibal 2015-07-27 10:58:15 +02:00
parent 6b586cdad4
commit 3d43f49f69
1 changed files with 31 additions and 28 deletions

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset from libc.string cimport memset
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t
import bz2 import bz2
from os import path from os import path
@ -186,17 +186,12 @@ cdef class Vocab:
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef CFile fp = CFile(bytes_loc, 'wb') cdef CFile fp = CFile(bytes_loc, 'wb')
cdef size_t st
cdef uint64_t size_of_lexeme = sizeof(LexemeC)
items = list(self._by_hash.items())
cdef uint64_t n_lexemes = len(items)
fp.write_from(&size_of_lexeme, 1, sizeof(size_of_lexeme))
fp.write_from(&n_lexemes, 1, sizeof(n_lexemes))
cdef size_t addr cdef size_t addr
cdef hash_t key cdef hash_t key
for key, addr in items: for key, addr in self._by_hash.items():
lexeme = <LexemeC*>addr lexeme = <LexemeC*>addr
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
fp.write_from(lexeme, sizeof(LexemeC), 1) fp.write_from(lexeme, sizeof(LexemeC), 1)
fp.close() fp.close()
@ -204,28 +199,36 @@ cdef class Vocab:
self.strings.load(strings_loc) self.strings.load(strings_loc)
if not path.exists(loc): if not path.exists(loc):
raise IOError('LexemeCs file not found at %s' % loc) raise IOError('LexemeCs file not found at %s' % loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef CFile fp = CFile(loc, 'rb') cdef FILE* fp = fopen(<char*>bytes_loc, b'rb')
if fp == NULL:
cdef uint64_t size_of_lexeme raise IOError('lexemes data file present, but cannot open from ' % loc)
cdef uint64_t n_lexemes cdef size_t st
fp.read_into(&size_of_lexeme, 1, sizeof(size_of_lexeme)) cdef LexemeC* lexeme
assert size_of_lexeme == sizeof(LexemeC) cdef attr_t orth
fp.read_into(&n_lexemes, 1, sizeof(n_lexemes))
lexemes = <LexemeC*>self.mem.alloc(n_lexemes, sizeof(LexemeC))
cdef hash_t key cdef hash_t key
cdef unicode py_str cdef unicode py_str
cdef int i i = 0
self.length = n_lexemes while True:
for i in range(n_lexemes): st = fread(&orth, sizeof(orth), 1, fp)
fp.read_into(&lexemes[i], sizeof(LexemeC), 1) if st != 1:
lexemes[i].repvec = EMPTY_VEC break
py_str = self.strings[lexemes[i].orth] lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# Copies data from the file into the lexeme
st = fread(lexeme, sizeof(LexemeC), 1, fp)
lexeme.repvec = EMPTY_VEC
if st != 1:
break
if orth != lexeme.orth:
# TODO: Improve this error message, pending resolution to Issue #64
raise IOError('Error reading from lexemes.bin. Integrity check fails.')
py_str = self.strings[orth]
key = hash_string(py_str) key = hash_string(py_str)
self._by_hash.set(key, &lexemes[i]) self._by_hash.set(key, lexeme)
self._by_orth.set(lexemes[i].orth, &lexemes[i]) self._by_orth.set(lexeme.orth, lexeme)
assert lexemes[i].length == len(py_str) self.length += 1
i += 1
fclose(fp)
def load_rep_vectors(self, loc): def load_rep_vectors(self, loc):
cdef CFile file_ = CFile(loc, b'rb') cdef CFile file_ = CFile(loc, b'rb')