From 67c8c8019fa1253a262f7c12443ea3bc61c96e12 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 30 Oct 2014 01:01:00 +1100
Subject: [PATCH] * Update lexeme serialization, using a binary file format

---
 setup.py             |  5 ++---
 spacy/lang.pyx       | 16 +++++++++++++---
 spacy/lexeme.pyx     |  1 -
 spacy/utf8string.pyx |  2 ++
 spacy/util.py        |  2 +-
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index 40fae269f..397091403 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,6 @@
 import distutils.core
 import sys
 import os
 import os.path
-import numpy
 from os import path
 from glob import glob
@@ -35,7 +34,7 @@
 compile_args = []
 link_args = []
 libs = []
-includes = ['.', numpy.get_include()]
+includes = ['.']
 cython_includes = ['.']
@@ -48,11 +47,11 @@ else:
 exts = [
     Extension("spacy.lang", ["spacy/lang.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.word", ["spacy/word.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.en", ["spacy/en.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.pos", ["spacy/pos.pyx"], language="c++", include_dirs=includes),
+    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
 ]
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 5b5892fdc..e01727313 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -45,6 +45,8 @@ cdef class Language:
         self.suffix_re = re.compile(suffix)
         self.infix_re = re.compile(infix)
         self.lexicon = Lexicon(lexemes)
+        self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
+        self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
 
     cpdef Tokens tokenize(self, unicode string):
@@ -244,6 +246,13 @@ cdef class Lexicon:
             self.lexemes.push_back(lexeme)
             self.size += 1
 
+    def set(self, unicode py_string, dict lexeme_dict):
+        cdef String string
+        string_from_unicode(&string, py_string)
+        cdef Lexeme* lex = self.get(&string)
+        lex[0] = lexeme_init(string.chars[:string.n], string.key, lex.i,
+                             self.strings, lexeme_dict)
+
     cdef Lexeme* get(self, String* string) except NULL:
         cdef Lexeme* lex
         lex = self._dict.get(string.key)
@@ -278,7 +287,7 @@ cdef class Lexicon:
         cdef FILE* fp = fopen(bytes_loc, 'wb')
         assert fp != NULL
         cdef size_t st
-        for i in range(self.size):
+        for i in range(self.size-1):
             st = fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp)
             assert st == 1
         st = fclose(fp)
@@ -293,11 +302,12 @@
         cdef Lexeme* lexeme
         while True:
             lexeme = self.mem.alloc(sizeof(Lexeme), 1)
-            st = fread(lexeme, sizeof(lexeme), 1, fp)
-            if st == 0:
+            st = fread(lexeme, sizeof(Lexeme), 1, fp)
+            if st != 1:
                 break
             self.lexemes.push_back(lexeme)
             self._dict.set(lexeme.hash, lexeme)
+        fclose(fp)
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 03c6e2270..887210225 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -31,7 +31,6 @@ cpdef Lexeme init(unicode string, hash_t hashed, atom_t i,
     cdef Lexeme lex
     lex.hash = hashed
     lex.i = i
-    print string, i
     lex.length = len(string)
     lex.sic = get_string_id(string, store)
diff --git a/spacy/utf8string.pyx b/spacy/utf8string.pyx
index 8cb2bebd2..07b92e5d6 100644
--- a/spacy/utf8string.pyx
+++ b/spacy/utf8string.pyx
@@ -58,10 +58,12 @@ cdef class StringStore:
         strings = []
         cdef Utf8Str* string
         cdef bytes py_string
+        print "Dump strings"
        for i in range(self.size):
             string = &self.strings[i]
             py_string = string.chars[:string.length]
             strings.append(py_string)
+        print len(strings)
         with open(loc, 'w') as file_:
             ujson.dump(strings, file_, ensure_ascii=False)
diff --git a/spacy/util.py b/spacy/util.py
index e68bac748..d06911400 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -13,7 +13,7 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(data_dir)
+    tokenization = read_tokenization(name)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
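The change replaces the old lexeme serialization with a fixed-size binary record format: Lexicon.dump() writes each Lexeme struct with fwrite(), and Lexicon.load() reads structs back with fread() until a short read (the new `if st != 1: break`) signals end of file, while StringStore still dumps the string table separately as JSON. Below is a minimal sketch of that write/read-until-short-read pattern in plain Python; the three-field record layout and the helper names (RECORD, dump_records, load_records) are hypothetical stand-ins, not the real Lexeme layout.

import struct

# Hypothetical fixed-size record (hash, sic, length); the real Lexeme struct has more fields.
RECORD = struct.Struct('<QQI')

def dump_records(loc, records):
    # One fixed-size block per record, like fwrite(self.lexemes[i], sizeof(Lexeme), 1, fp).
    with open(loc, 'wb') as fp:
        for rec in records:
            fp.write(RECORD.pack(*rec))

def load_records(loc):
    # Read fixed-size blocks until a short read, like the `if st != 1: break` loop in Lexicon.load().
    records = []
    with open(loc, 'rb') as fp:
        while True:
            buf = fp.read(RECORD.size)
            if len(buf) != RECORD.size:
                break
            records.append(RECORD.unpack(buf))
    return records

Terminating on the short read means the file needs no record count up front, which is why Lexicon.load() can simply loop until fread() stops returning 1.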