diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 964785bbe..879f3378a 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,6 +12,7 @@ from .lexeme cimport Lexeme from .strings cimport slice_unicode from .strings cimport hash_string from .orth cimport word_shape +from .typedefs cimport attr_t from cymem.cymem cimport Address @@ -41,8 +42,8 @@ cdef class Vocab: if data_dir is not None: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) - self.strings.load(path.join(data_dir, 'strings.txt')) - self.load_lexemes(path.join(data_dir, 'lexemes.bin')) + self.load_lexemes(path.join(data_dir, 'strings.txt'), + path.join(data_dir, 'lexemes.bin')) if path.exists(path.join(data_dir, 'vec.bin')): self.load_rep_vectors(path.join(data_dir, 'vec.bin')) @@ -129,14 +130,15 @@ cdef class Vocab: if key == 0: continue lexeme = self._map.c_map.cells[i].value - st = fwrite(&key, sizeof(key), 1, fp) + st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp) assert st == 1 st = fwrite(lexeme, sizeof(LexemeC), 1, fp) assert st == 1 st = fclose(fp) assert st == 0 - def load_lexemes(self, loc): + def load_lexemes(self, strings_loc, loc): + self.strings.load(strings_loc) if not path.exists(loc): raise IOError('LexemeCs file not found at %s' % loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc @@ -144,10 +146,12 @@ cdef class Vocab: assert fp != NULL cdef size_t st cdef LexemeC* lexeme + cdef attr_t orth cdef hash_t key + cdef unicode py_str i = 0 while True: - st = fread(&key, sizeof(key), 1, fp) + st = fread(&orth, sizeof(orth), 1, fp) if st != 1: break lexeme = self.mem.alloc(sizeof(LexemeC), 1) @@ -156,6 +160,9 @@ cdef class Vocab: lexeme.repvec = EMPTY_VEC if st != 1: break + assert orth == lexeme.orth + py_str = self.strings[orth] + key = hash_string(py_str) self._map.set(key, lexeme) while self.lexemes.size() < (lexeme.id + 1): self.lexemes.push_back(&EMPTY_LEXEME)