From e40caae51fd497725acf6e6d78f8c8a1ee727f8d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 9 Oct 2014 14:51:35 +1100
Subject: [PATCH] * Update Lexicon class to expect a list of lexeme dict descriptions

---
 spacy/lang.pyx   | 39 ++++++++++++---------------------------
 spacy/lexeme.pxd |  1 +
 spacy/util.py    | 18 ++++++++----------
 3 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index f6abf4aee..35d1838b2 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -15,7 +15,7 @@ import re
 from .util import read_lang_data
 
 from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, lexeme_init
+from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack
 
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -25,7 +25,6 @@ from cymem.cymem cimport Pool
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
 
-
 from preshed.maps cimport PreshMap
 
 from spacy import orth
 from spacy import util
@@ -69,7 +68,6 @@ cdef enum Views:
     View_N
 
-
 # Assign the flag and view functions by enum value.
 # This is verbose, but it ensures we don't get nasty order sensitivities.
 STRING_VIEW_FUNCS = [None] * View_N
@@ -107,8 +105,6 @@
 FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7)
 FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)
 
-
-
 cdef class Language:
     """Base class for language-specific tokenizers.
@@ -127,23 +123,19 @@ cdef class Language:
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
 
-    def __cinit__(self, name, user_string_features, user_flag_features):
+    def __init__(self, name, user_string_features, user_flag_features):
         self.name = name
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        lang_data = util.read_lang_data(name)
-        rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
+        rules, prefix, suffix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
-        self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
+        self.lexicon = Lexicon(lexemes,
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
 
-    def __dealloc__(self):
-        pass
-
     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
@@ -347,27 +339,20 @@
 cdef class Lexicon:
-    def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
-                  string_features, flag_features):
+    def __cinit__(self, lexemes, string_features, flag_features):
         self._mem = Pool()
         self._flag_features = flag_features
         self._string_features = string_features
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
-        for uni_string in words:
-            prob = probs.get(uni_string, 0.0)
-            cluster = clusters.get(uni_string, 0.0)
-            cases = case_stats.get(uni_string, {})
-            tags = tag_stats.get(uni_string, {})
-            views = [string_view(uni_string, prob, cluster, cases, tags)
-                     for string_view in self._string_features]
-            flags = set()
-            for i, flag_feature in enumerate(self._flag_features):
-                if flag_feature(uni_string, prob, cluster, cases, tags):
-                    flags.add(i)
-            lexeme = lexeme_init(self._mem, self.size, uni_string, prob, cluster, views, flags)
-            string_from_unicode(&string, uni_string)
+        cdef dict lexeme_dict
+        cdef LexemeC* lexeme
+        for lexeme_dict in lexemes:
+            string_from_unicode(&string, lexeme_dict['string'])
+            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+            lexeme.views = <char**>self._mem.alloc(len(string_features), sizeof(char*))
+            lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.size += 1
 
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 6a249bf07..f45c581f2 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -22,3 +22,4 @@ cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
 
 
 cdef dict lexeme_pack(LexemeC* lexeme)
+cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
diff --git a/spacy/util.py b/spacy/util.py
index 229dc81a4..15c03780a 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -16,18 +16,16 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
-    words = load_resource(data_dir, 'words')
-    probs = load_resource(data_dir, 'probs')
-    clusters = load_resource(data_dir, 'clusters')
-    case_stats = load_resource(data_dir, 'case_stats')
-    tag_stats = load_resource(data_dir, 'tag_stats')
-    return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats
+
+    lex_loc = path.join(data_dir, 'lexemes.json')
+    if path.exists(lex_loc):
+        with open(lex_loc) as file_:
+            lexemes = ujson.load(file_)
+    else:
+        lexemes = []
+    return tokenization, prefix, suffix, lexemes
 
 
-def load_resource(data_dir, name):
-    loc = path.join(data_dir, name + '.json')
-    return json.load(loc) if path.exists(loc) else {}
-
 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
         entries = file_.read().split('\n')
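
Note on the new data format: after this patch, read_lang_data() expects each
language's data directory to hold a single lexemes.json file containing a list
of per-word dicts, replacing the separate words/probs/clusters/case_stats/
tag_stats resources. The patch itself only confirms the 'string' key (it is
read via lexeme_dict['string']); the rest of each dict is consumed opaquely by
lexeme_unpack(). Below is a minimal migration sketch for producing such a
file from the old resources, assuming the remaining key names mirror the
removed Lexicon constructor arguments; build_lexemes_json is a hypothetical
helper, not part of the patch.

    import json
    from os import path

    def build_lexemes_json(data_dir):
        # Read one of the old per-resource JSON files, defaulting to empty
        # when the file is missing (as the removed load_resource() helper
        # intended; note it passed a path string straight to json.load,
        # which requires a file object).
        def load_resource(name):
            loc = path.join(data_dir, name + '.json')
            if not path.exists(loc):
                return {}
            with open(loc) as file_:
                return json.load(file_)

        probs = load_resource('probs')
        clusters = load_resource('clusters')
        case_stats = load_resource('case_stats')
        tag_stats = load_resource('tag_stats')
        lexemes = []
        for string in load_resource('words'):
            lexemes.append({
                'string': string,                          # confirmed by the patch
                'prob': probs.get(string, 0.0),            # assumed key name
                'cluster': clusters.get(string, 0),        # assumed key name
                'case_stats': case_stats.get(string, {}),  # assumed key name
                'tag_stats': tag_stats.get(string, {}),    # assumed key name
            })
        # Write the consolidated list where the new loader looks for it.
        with open(path.join(data_dir, 'lexemes.json'), 'w') as file_:
            json.dump(lexemes, file_)

Consolidating the five lookups into one list keeps each word's description in
a single dict, which is what the new Lexicon.__cinit__ hands to lexeme_unpack()
one entry at a time.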