mirror of https://github.com/explosion/spaCy.git
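* Fix platform-specific lexicon bug: write each lexeme's orth ID (rather than its hash key) to lexemes.bin, and on load recompute the key by hashing the string looked up from strings.txt.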
This commit is contained in:
parent
a1ed574b7b
commit
ce3ae8b5d9
|
@ -12,6 +12,7 @@ from .lexeme cimport Lexeme
|
||||||
from .strings cimport slice_unicode
|
from .strings cimport slice_unicode
|
||||||
from .strings cimport hash_string
|
from .strings cimport hash_string
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
|
from .typedefs cimport attr_t
|
||||||
|
|
||||||
from cymem.cymem cimport Address
|
from cymem.cymem cimport Address
|
||||||
|
|
||||||
|
@ -41,8 +42,8 @@ cdef class Vocab:
|
||||||
if data_dir is not None:
|
if data_dir is not None:
|
||||||
if not path.isdir(data_dir):
|
if not path.isdir(data_dir):
|
||||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||||
self.strings.load(path.join(data_dir, 'strings.txt'))
|
self.load_lexemes(path.join(data_dir, 'strings.txt'),
|
||||||
self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
|
path.join(data_dir, 'lexemes.bin'))
|
||||||
if path.exists(path.join(data_dir, 'vec.bin')):
|
if path.exists(path.join(data_dir, 'vec.bin')):
|
||||||
self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||||
|
|
||||||
|
@ -129,14 +130,15 @@ cdef class Vocab:
|
||||||
if key == 0:
|
if key == 0:
|
||||||
continue
|
continue
|
||||||
lexeme = <LexemeC*>self._map.c_map.cells[i].value
|
lexeme = <LexemeC*>self._map.c_map.cells[i].value
|
||||||
st = fwrite(&key, sizeof(key), 1, fp)
|
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
|
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fclose(fp)
|
st = fclose(fp)
|
||||||
assert st == 0
|
assert st == 0
|
||||||
|
|
||||||
def load_lexemes(self, loc):
|
def load_lexemes(self, strings_loc, loc):
|
||||||
|
self.strings.load(strings_loc)
|
||||||
if not path.exists(loc):
|
if not path.exists(loc):
|
||||||
raise IOError('LexemeCs file not found at %s' % loc)
|
raise IOError('LexemeCs file not found at %s' % loc)
|
||||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||||
|
@ -144,10 +146,12 @@ cdef class Vocab:
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
cdef size_t st
|
cdef size_t st
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
|
cdef attr_t orth
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
|
cdef unicode py_str
|
||||||
i = 0
|
i = 0
|
||||||
while True:
|
while True:
|
||||||
st = fread(&key, sizeof(key), 1, fp)
|
st = fread(&orth, sizeof(orth), 1, fp)
|
||||||
if st != 1:
|
if st != 1:
|
||||||
break
|
break
|
||||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||||
|
@ -156,6 +160,9 @@ cdef class Vocab:
|
||||||
lexeme.repvec = EMPTY_VEC
|
lexeme.repvec = EMPTY_VEC
|
||||||
if st != 1:
|
if st != 1:
|
||||||
break
|
break
|
||||||
|
assert orth == lexeme.orth
|
||||||
|
py_str = self.strings[orth]
|
||||||
|
key = hash_string(py_str)
|
||||||
self._map.set(key, lexeme)
|
self._map.set(key, lexeme)
|
||||||
while self.lexemes.size() < (lexeme.id + 1):
|
while self.lexemes.size() < (lexeme.id + 1):
|
||||||
self.lexemes.push_back(&EMPTY_LEXEME)
|
self.lexemes.push_back(&EMPTY_LEXEME)
|
||||||
|
|
Loading…
Reference in New Issue