From 6b586cdad4ce236bfbedf38a33999687121abfc9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 27 Jul 2015 08:31:51 +0200 Subject: [PATCH] * Change lexemes.bin format. Add a header specifying size of LexemeC and number of lexemes, and don't have the redundant orth information. --- spacy/vocab.pyx | 59 +++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ac2e11e11..9a29179d4 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -3,7 +3,7 @@ from __future__ import unicode_literals from libc.stdio cimport fopen, fclose, fread, fwrite, FILE from libc.string cimport memset -from libc.stdint cimport int32_t +from libc.stdint cimport int32_t, uint64_t import bz2 from os import path @@ -186,12 +186,17 @@ cdef class Vocab: cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc cdef CFile fp = CFile(bytes_loc, 'wb') - cdef size_t st + + cdef uint64_t size_of_lexeme = sizeof(LexemeC) + items = list(self._by_hash.items()) + cdef uint64_t n_lexemes = len(items) + fp.write_from(&size_of_lexeme, 1, sizeof(size_of_lexeme)) + fp.write_from(&n_lexemes, 1, sizeof(n_lexemes)) + cdef size_t addr cdef hash_t key - for key, addr in self._by_hash.items(): + for key, addr in items: lexeme = addr - fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) fp.write_from(lexeme, sizeof(LexemeC), 1) fp.close() @@ -199,36 +204,28 @@ cdef class Vocab: self.strings.load(strings_loc) if not path.exists(loc): raise IOError('LexemeCs file not found at %s' % loc) - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - cdef FILE* fp = fopen(bytes_loc, b'rb') - if fp == NULL: - raise IOError('lexemes data file present, but cannot open from ' % loc) - cdef size_t st - cdef LexemeC* lexeme - cdef attr_t orth + + cdef CFile fp = CFile(loc, 'rb') + + cdef uint64_t size_of_lexeme + cdef uint64_t n_lexemes + fp.read_into(&size_of_lexeme, 1, sizeof(size_of_lexeme)) + assert size_of_lexeme == sizeof(LexemeC) + fp.read_into(&n_lexemes, 1, sizeof(n_lexemes)) + + lexemes = self.mem.alloc(n_lexemes, sizeof(LexemeC)) cdef hash_t key cdef unicode py_str - i = 0 - while True: - st = fread(&orth, sizeof(orth), 1, fp) - if st != 1: - break - lexeme = self.mem.alloc(sizeof(LexemeC), 1) - # Copies data from the file into the lexeme - st = fread(lexeme, sizeof(LexemeC), 1, fp) - lexeme.repvec = EMPTY_VEC - if st != 1: - break - if orth != lexeme.orth: - # TODO: Improve this error message, pending resolution to Issue #64 - raise IOError('Error reading from lexemes.bin. Integrity check fails.') - py_str = self.strings[orth] + cdef int i + self.length = n_lexemes + for i in range(n_lexemes): + fp.read_into(&lexemes[i], sizeof(LexemeC), 1) + lexemes[i].repvec = EMPTY_VEC + py_str = self.strings[lexemes[i].orth] key = hash_string(py_str) - self._by_hash.set(key, lexeme) - self._by_orth.set(lexeme.orth, lexeme) - self.length += 1 - i += 1 - fclose(fp) + self._by_hash.set(key, &lexemes[i]) + self._by_orth.set(lexemes[i].orth, &lexemes[i]) + assert lexemes[i].length == len(py_str) def load_rep_vectors(self, loc): cdef CFile file_ = CFile(loc, b'rb')