* Update lexeme serialization, using a binary file format

Matthew Honnibal 2014-10-30 01:01:00 +11:00
parent 13909a2e24
commit 67c8c8019f
5 changed files with 18 additions and 8 deletions
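
The diff below switches the lexeme store to a raw binary format: Lexicon.dump writes each fixed-size Lexeme struct straight to disk with fwrite, and Lexicon.load reads them back with fread and re-indexes them by hash. As a rough orientation, here is a minimal pure-Python sketch of that fixed-size-record pattern, using the struct module and a hypothetical three-field record rather than the real Lexeme layout:

    import struct

    # Hypothetical record with three fields standing in for a few Lexeme
    # members; the real layout lives in the Lexeme struct and is not shown here.
    RECORD = struct.Struct('<QIi')   # 8-byte hash, 4-byte id, 4-byte length

    def dump_lexemes(loc, records):
        # Write the records back to back as raw bytes, like fwrite on a Lexeme*.
        with open(loc, 'wb') as fp:
            for hash_, i, length in records:
                fp.write(RECORD.pack(hash_, i, length))

    def load_lexemes(loc):
        # Read fixed-size chunks until a short read signals end of file,
        # mirroring the fread loop in Lexicon.load below.
        records = []
        with open(loc, 'rb') as fp:
            while True:
                buf = fp.read(RECORD.size)
                if len(buf) != RECORD.size:
                    break
                records.append(RECORD.unpack(buf))
        return records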

setup.py

@@ -6,7 +6,6 @@ import distutils.core
 import sys
 import os
 import os.path
-import numpy
 from os import path
 from glob import glob
@@ -35,7 +34,7 @@ compile_args = []
 link_args = []
 libs = []
-includes = ['.', numpy.get_include()]
+includes = ['.']
 cython_includes = ['.']
@@ -48,11 +47,11 @@ else:
 exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
 ]
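
With import numpy gone and numpy.get_include() dropped from includes, setup.py no longer needs numpy at build time, and the utf8string module joins the compiled extensions. For reference, a stand-alone sketch of building Cython extensions this way (it uses cythonize and a trimmed module list, so it is an illustration rather than the repo's actual build script):

    from setuptools import setup
    from setuptools.extension import Extension
    from Cython.Build import cythonize

    includes = ['.']   # no numpy.get_include(): nothing here uses the numpy C API

    exts = [
        Extension("spacy.lang", ["spacy/lang.pyx"], language="c++",
                  include_dirs=includes),
        Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++",
                  include_dirs=includes),
    ]

    setup(name="spacy", ext_modules=cythonize(exts))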

spacy/lang.pyx

@@ -45,6 +45,8 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
+        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
+        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
 
     cpdef Tokens tokenize(self, unicode string):
@@ -244,6 +246,13 @@ cdef class Lexicon:
             self.lexemes.push_back(lexeme)
             self.size += 1
+
+    def set(self, unicode py_string, dict lexeme_dict):
+        cdef String string
+        string_from_unicode(&string, py_string)
+        cdef Lexeme* lex = self.get(&string)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
+                             self.strings, lexeme_dict)
 
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
         lex = <Lexeme*>self._dict.get(string.key)
@@ -278,7 +287,7 @@ cdef class Lexicon:
         cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
-        for i in range(self.size):
+        for i in range(self.size-1):
             st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
             assert st == 1
         st = fclose(fp)
@@ -293,11 +302,12 @@ cdef class Lexicon:
         cdef Lexeme* lexeme
         while True:
            lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
-            st = fread(lexeme, sizeof(lexeme), 1, fp)
-            if st == 0:
+            st = fread(lexeme, sizeof(Lexeme), 1, fp)
+            if st != 1:
                 break
             self.lexemes.push_back(lexeme)
             self._dict.set(lexeme.hash, lexeme)
+        fclose(fp)
 
 
 cdef void string_from_unicode(String* s, unicode uni):
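
The new Lexicon.set method fetches (or allocates) the Lexeme slot for a string via get() and then overwrites the struct in place with values built from a Python dict. A rough Python analogue of that get-or-create-then-overwrite pattern (class and field names here are illustrative only):

    class MiniLexicon(object):
        """Toy stand-in for Lexicon: a hash table of per-string records."""

        def __init__(self):
            self._table = {}   # string hash -> record dict
            self.size = 0

        def get(self, py_string):
            # Get-or-create, like Lexicon.get allocating a fresh Lexeme*.
            key = hash(py_string)
            if key not in self._table:
                self._table[key] = {'sic': py_string, 'i': self.size}
                self.size += 1
            return self._table[key]

        def set(self, py_string, lexeme_dict):
            # Overwrite the stored record in place, like lex[0] = lexeme_init(...).
            rec = self.get(py_string)
            rec.update(lexeme_dict)

So, in this sketch, lexicon.set(u'dog', {'prob': -7.3}) updates the entry for 'dog' without changing its id.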

spacy/lexeme.pyx

@@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
     cdef Lexeme lex
     lex.hash = hashed
     lex.i = i
-    print string, i
     lex.length = len(string)
     lex.sic = get_string_id(string, store)

spacy/utf8string.pyx

@@ -58,10 +58,12 @@ cdef class StringStore:
         strings = []
         cdef Utf8Str* string
         cdef bytes py_string
+        print "Dump strings"
         for i in range(self.size):
             string = &self.strings[i]
             py_string = string.chars[:string.length]
             strings.append(py_string)
+        print len(strings)
         with open(loc, 'w') as file_:
             ujson.dump(strings, file_, ensure_ascii=False)
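
The interned strings are serialized separately from the binary lexeme table: StringStore.dump writes them as one JSON array with ujson, and the Language constructor above loads them back alongside the lexemes. A hedged sketch of that round trip (the real StringStore.load is not part of this diff, so the loader here is only an assumed counterpart):

    import ujson

    def dump_strings(loc, strings):
        # Mirror StringStore.dump: one JSON array holding every interned string.
        with open(loc, 'w') as file_:
            ujson.dump(strings, file_, ensure_ascii=False)

    def load_strings(loc):
        # Assumed counterpart: read the array back in order, so each string
        # keeps the integer id given by its position in the store.
        with open(loc, 'r') as file_:
            return ujson.load(file_)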

spacy/util.py

@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(data_dir)
+    tokenization = read_tokenization(name)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)