* Fix platform-specific lexicon bug.

2015-01-31 16:38:58 +11:00 · 2015-01-31 16:38:58 +11:00 · ce3ae8b5d9
parent a1ed574b7b
commit ce3ae8b5d9
1 changed files with 12 additions and 5 deletions
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -12,6 +12,7 @@ from .lexeme cimport Lexeme
 from .strings cimport slice_unicode
 from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
 from cymem.cymem cimport Address
@ -41,8 +42,8 @@ cdef class Vocab:
        if data_dir is not None:
            if not path.isdir(data_dir):
                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-            self.strings.load(path.join(data_dir, 'strings.txt'))
+            self.load_lexemes(path.join(data_dir, 'strings.txt'),
-            self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
+                              path.join(data_dir, 'lexemes.bin'))
            if path.exists(path.join(data_dir, 'vec.bin')):
                self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
@ -129,14 +130,15 @@ cdef class Vocab:
            if key == 0:
                continue
            lexeme = <LexemeC*>self._map.c_map.cells[i].value
-            st = fwrite(&key, sizeof(key), 1, fp)
+            st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
            assert st == 1
            st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
            assert st == 1
        st = fclose(fp)
        assert st == 0
-    def load_lexemes(self, loc):
+    def load_lexemes(self, strings_loc, loc):
        self.strings.load(strings_loc)
        if not path.exists(loc):
            raise IOError('LexemeCs file not found at %s' % loc)
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@ -144,10 +146,12 @@ cdef class Vocab:
        assert fp != NULL
        cdef size_t st
        cdef LexemeC* lexeme
        cdef attr_t orth
        cdef hash_t key
        cdef unicode py_str
        i = 0
        while True:
-            st = fread(&key, sizeof(key), 1, fp)
+            st = fread(&orth, sizeof(orth), 1, fp)
            if st != 1:
                break
            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
@ -156,6 +160,9 @@ cdef class Vocab:
            lexeme.repvec = EMPTY_VEC
            if st != 1:
                break
            assert orth == lexeme.orth
            py_str = self.strings[orth]
            key = hash_string(py_str)
            self._map.set(key, lexeme)
            while self.lexemes.size() < (lexeme.id + 1):
                self.lexemes.push_back(&EMPTY_LEXEME)