diff --git a/spacy/lang.pyx b/spacy/lang.pyx index f6abf4aee..35d1838b2 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -15,7 +15,7 @@ import re from .util import read_lang_data from spacy.tokens import Tokens -from spacy.lexeme cimport LexemeC, lexeme_init +from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack from murmurhash.mrmr cimport hash64 from cpython.ref cimport Py_INCREF @@ -25,7 +25,6 @@ from cymem.cymem cimport Pool from cython.operator cimport preincrement as preinc from cython.operator cimport dereference as deref - from preshed.maps cimport PreshMap from spacy import orth from spacy import util @@ -69,7 +68,6 @@ cdef enum Views: View_N - # Assign the flag and view functions by enum value. # This is verbose, but it ensures we don't get nasty order sensitivities. STRING_VIEW_FUNCS = [None] * View_N @@ -107,8 +105,6 @@ FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7) FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7) - - cdef class Language: """Base class for language-specific tokenizers. @@ -127,23 +123,19 @@ cdef class Language: fl_is_digit = Flag_IsDigit v_shape = View_WordShape - def __cinit__(self, name, user_string_features, user_flag_features): + def __init__(self, name, user_string_features, user_flag_features): self.name = name self._mem = Pool() self.cache = PreshMap(2 ** 25) self.specials = PreshMap(2 ** 16) - lang_data = util.read_lang_data(name) - rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data + rules, prefix, suffix, lexemes = util.read_lang_data(name) self.prefix_re = re.compile(prefix) self.suffix_re = re.compile(suffix) - self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats, + self.lexicon = Lexicon(lexemes, STRING_VIEW_FUNCS + user_string_features, FLAG_FUNCS + user_flag_features) self._load_special_tokenization(rules) - def __dealloc__(self): - pass - property nr_types: def __get__(self): """Return the number of lexical types in the vocabulary""" @@ -347,27 +339,20 @@ cdef class Language: cdef class Lexicon: - def __cinit__(self, words, probs, clusters, case_stats, tag_stats, - string_features, flag_features): + def __cinit__(self, lexemes, string_features, flag_features): self._mem = Pool() self._flag_features = flag_features self._string_features = string_features self._dict = PreshMap(2 ** 20) self.size = 0 cdef String string - for uni_string in words: - prob = probs.get(uni_string, 0.0) - cluster = clusters.get(uni_string, 0.0) - cases = case_stats.get(uni_string, {}) - tags = tag_stats.get(uni_string, {}) - views = [string_view(uni_string, prob, cluster, cases, tags) - for string_view in self._string_features] - flags = set() - for i, flag_feature in enumerate(self._flag_features): - if flag_feature(uni_string, prob, cluster, cases, tags): - flags.add(i) - lexeme = lexeme_init(self._mem, self.size, uni_string, prob, cluster, views, flags) - string_from_unicode(&string, uni_string) + cdef dict lexeme_dict + cdef LexemeC* lexeme + for lexeme_dict in lexemes: + string_from_unicode(&string, lexeme_dict['string']) + lexeme = self._mem.alloc(1, sizeof(LexemeC)) + lexeme.views = self._mem.alloc(len(string_features), sizeof(char*)) + lexeme_unpack(lexeme, lexeme_dict) self._dict.set(string.key, lexeme) self.size += 1 diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 6a249bf07..f45c581f2 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -22,3 +22,4 @@ cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id) cdef dict lexeme_pack(LexemeC* lexeme) +cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1 diff --git a/spacy/util.py b/spacy/util.py index 229dc81a4..15c03780a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -16,18 +16,16 @@ def read_lang_data(name): tokenization = read_tokenization(data_dir) prefix = read_prefix(data_dir) suffix = read_suffix(data_dir) - words = load_resource(data_dir, 'words') - probs = load_resource(data_dir, 'probs') - clusters = load_resource(data_dir, 'clusters') - case_stats = load_resource(data_dir, 'case_stats') - tag_stats = load_resource(data_dir, 'tag_stats') - return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats + + lex_loc = path.join(data_dir, 'lexemes.json') + if path.exists(lex_loc): + with open(lex_loc) as file_: + lexemes = ujson.load(file_) + else: + lexemes = [] + return tokenization, prefix, suffix, lexemes -def load_resource(data_dir, name): - loc = path.join(data_dir, name + '.json') - return json.load(loc) if path.exists(loc) else {} - def read_prefix(data_dir): with utf8open(path.join(data_dir, 'prefix')) as file_: entries = file_.read().split('\n')