From 67c8c8019fa1253a262f7c12443ea3bc61c96e12 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Oct 2014 01:01:00 +1100
Subject: [PATCH] * Update lexeme serialization, using a binary file format

---
 setup.py             |  5 ++---
 spacy/lang.pyx       | 16 +++++++++++++---
 spacy/lexeme.pyx     |  1 -
 spacy/utf8string.pyx |  2 ++
 spacy/util.py        |  2 +-
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index 40fae269f..397091403 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,6 @@
 import distutils.core
 import sys
 import os
 import os.path
-import numpy
 from os import path
 from glob import glob
@@ -35,7 +34,7 @@
 compile_args = []
 link_args = []
 libs = []
-includes = ['.', numpy.get_include()]
+includes = ['.']
 cython_includes = ['.']
@@ -48,11 +47,11 @@ else:
 exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
 ]
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 5b5892fdc..e01727313 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -45,6 +45,8 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
+        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
+        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
 
     cpdef Tokens tokenize(self, unicode string):
@@ -244,6 +246,13 @@ cdef class Lexicon:
             self.lexemes.push_back(lexeme)
             self.size += 1
 
+    def set(self, unicode py_string, dict lexeme_dict):
+        cdef String string
+        string_from_unicode(&string, py_string)
+        cdef Lexeme* lex = self.get(&string)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
+                             self.strings, lexeme_dict)
+
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
         lex = self._dict.get(string.key)
@@ -278,7 +287,7 @@ cdef class Lexicon:
         cdef FILE* fp = fopen(bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
-        for i in range(self.size):
+        for i in range(self.size-1):
             st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
             assert st == 1
         st = fclose(fp)
@@ -293,11 +302,12 @@
         cdef Lexeme* lexeme
         while True:
             lexeme = self.mem.alloc(sizeof(Lexeme), 1)
-            st = fread(lexeme, sizeof(lexeme), 1, fp)
-            if st == 0:
+            st = fread(lexeme, sizeof(Lexeme), 1, fp)
+            if st != 1:
                 break
             self.lexemes.push_back(lexeme)
             self._dict.set(lexeme.hash, lexeme)
+        fclose(fp)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 03c6e2270..887210225 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
     cdef Lexeme lex
     lex.hash = hashed
     lex.i = i
-    print string, i
     lex.length = len(string)
     lex.sic = get_string_id(string, store)
diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx
index 8cb2bebd2..07b92e5d6 100644
--- a/spacy/utf8string.pyx
+++ b/spacy/utf8string.pyx
@@ -58,10 +58,12 @@ cdef class StringStore:
         strings = []
         cdef Utf8Str* string
         cdef bytes py_string
+        print "Dump strings"
        for i in range(self.size):
             string = &self.strings[i]
             py_string = string.chars[:string.length]
             strings.append(py_string)
+        print len(strings)
         with open(loc, 'w') as file_:
             ujson.dump(strings, file_, ensure_ascii=False)
diff --git a/spacy/util.py b/spacy/util.py
index e68bac748..d06911400 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(data_dir)
+    tokenization = read_tokenization(name)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
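The change replaces the old lexeme serialization with a fixed-size binary record format: Lexicon.dump() writes each Lexeme struct with fwrite(), and Lexicon.load() reads structs back with fread() until a short read (the new `if st != 1: break`) signals end of file, while StringStore still dumps the string table separately as JSON. Below is a minimal sketch of that write/read-until-short-read pattern in plain Python; the three-field record layout and the helper names (RECORD, dump_records, load_records) are hypothetical stand-ins, not the real Lexeme layout.

import struct

# Hypothetical fixed-size record (hash, sic, length); the real Lexeme struct has more fields.
RECORD = struct.Struct('<QQI')

def dump_records(loc, records):
    # One fixed-size block per record, like fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp).
    with open(loc, 'wb') as fp:
        for rec in records:
            fp.write(RECORD.pack(*rec))

def load_records(loc):
    # Read fixed-size blocks until a short read, like the `if st != 1: break` loop in Lexicon.load().
    records = []
    with open(loc, 'rb') as fp:
        while True:
            buf = fp.read(RECORD.size)
            if len(buf) != RECORD.size:
                break
            records.append(RECORD.unpack(buf))
    return records

Terminating on the short read means the file needs no record count up front, which is why Lexicon.load() can simply loop until fread() stops returning 1.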