* Update Lexicon class to expect a list of lexeme dict descriptions

Matthew Honnibal 2014-10-09 14:51:35 +11:00
parent 51d75b244b
commit e40caae51f
3 changed files with 21 additions and 37 deletions
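The change replaces the parallel words/probs/clusters/case_stats/tag_stats resources with a single list of per-word dicts. As a minimal sketch, a lexeme description might look like the following; only the 'string' key is visible in this diff, and the remaining keys are assumptions based on the resources the old API read separately:

    # Hypothetical lexeme descriptions, e.g. as loaded from lexemes.json.
    # Only the 'string' key is confirmed by this diff; 'prob', 'cluster'
    # and 'flags' are assumed from the old parallel resources.
    lexemes = [
        {'string': u'the',   'prob': -3.2, 'cluster': 42, 'flags': 0},
        {'string': u'quick', 'prob': -8.7, 'cluster': 17, 'flags': 0},
    ]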

View File

@@ -15,7 +15,7 @@ import re
 from .util import read_lang_data
 from spacy.tokens import Tokens
-from spacy.lexeme cimport LexemeC, lexeme_init
+from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack
 from murmurhash.mrmr cimport hash64
 from cpython.ref cimport Py_INCREF
@@ -25,7 +25,6 @@ from cymem.cymem cimport Pool
 from cython.operator cimport preincrement as preinc
 from cython.operator cimport dereference as deref
 from preshed.maps cimport PreshMap
 from spacy import orth
 from spacy import util
@@ -69,7 +68,6 @@ cdef enum Views:
     View_N
 # Assign the flag and view functions by enum value.
 # This is verbose, but it ensures we don't get nasty order sensitivities.
 STRING_VIEW_FUNCS = [None] * View_N
@@ -107,8 +105,6 @@ FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7)
 FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)
 cdef class Language:
     """Base class for language-specific tokenizers.
@@ -127,23 +123,19 @@ cdef class Language:
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
-    def __cinit__(self, name, user_string_features, user_flag_features):
+    def __init__(self, name, user_string_features, user_flag_features):
         self.name = name
         self._mem = Pool()
         self.cache = PreshMap(2 ** 25)
         self.specials = PreshMap(2 ** 16)
-        lang_data = util.read_lang_data(name)
-        rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
+        rules, prefix, suffix, lexemes = util.read_lang_data(name)
         self.prefix_re = re.compile(prefix)
         self.suffix_re = re.compile(suffix)
-        self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
+        self.lexicon = Lexicon(lexemes,
                                STRING_VIEW_FUNCS + user_string_features,
                                FLAG_FUNCS + user_flag_features)
         self._load_special_tokenization(rules)
-    def __dealloc__(self):
-        pass
     property nr_types:
         def __get__(self):
             """Return the number of lexical types in the vocabulary"""
@@ -347,27 +339,20 @@ cdef class Language:
 cdef class Lexicon:
-    def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
-                  string_features, flag_features):
+    def __cinit__(self, lexemes, string_features, flag_features):
         self._mem = Pool()
         self._flag_features = flag_features
         self._string_features = string_features
         self._dict = PreshMap(2 ** 20)
         self.size = 0
         cdef String string
-        for uni_string in words:
-            prob = probs.get(uni_string, 0.0)
-            cluster = clusters.get(uni_string, 0.0)
-            cases = case_stats.get(uni_string, {})
-            tags = tag_stats.get(uni_string, {})
-            views = [string_view(uni_string, prob, cluster, cases, tags)
-                     for string_view in self._string_features]
-            flags = set()
-            for i, flag_feature in enumerate(self._flag_features):
-                if flag_feature(uni_string, prob, cluster, cases, tags):
-                    flags.add(i)
-            lexeme = lexeme_init(self._mem, self.size, uni_string, prob, cluster, views, flags)
-            string_from_unicode(&string, uni_string)
+        cdef dict lexeme_dict
+        cdef LexemeC* lexeme
+        for lexeme_dict in lexemes:
+            string_from_unicode(&string, lexeme_dict['string'])
+            lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
+            lexeme.views = <char**>self._mem.alloc(len(string_features), sizeof(char*))
+            lexeme_unpack(lexeme, lexeme_dict)
             self._dict.set(string.key, lexeme)
             self.size += 1
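In pure-Python terms, the new Lexicon constructor loop hashes each entry's string and stores the unpacked record under that key. A rough analogue, with the built-in hash() standing in for the murmurhash-based string key and a plain dict for the PreshMap:

    # Rough Python analogue of the new Lexicon.__cinit__ loop; hash()
    # and dict stand in for the murmurhash string key and the PreshMap.
    class PyLexicon(object):
        def __init__(self, lexemes):
            self._dict = {}
            self.size = 0
            for lexeme_dict in lexemes:
                key = hash(lexeme_dict['string'])
                # Copying the dict stands in for lexeme_unpack filling
                # a freshly allocated LexemeC struct.
                self._dict[key] = dict(lexeme_dict)
                self.size += 1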
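Separately, note the switch from __cinit__ to __init__ on Language earlier in this file: in Cython, __cinit__ is always invoked with exactly the arguments passed to the type and cannot be overridden with a different signature, whereas __init__ behaves like a normal Python initializer. A hedged sketch of the subclassing pattern this permits (the English subclass here is hypothetical):

    # Hypothetical subclass; with __init__, a language can supply its
    # own fixed arguments to the base initializer.
    class English(Language):
        def __init__(self):
            Language.__init__(self, 'en', [], [])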

View File

@@ -22,3 +22,4 @@ cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
 cdef dict lexeme_pack(LexemeC* lexeme)
+cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
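The lexeme_unpack declaration added here is the inverse of the existing lexeme_pack: pack serializes a LexemeC struct to a dict, unpack fills a struct from such a dict, and the except -1 clause lets Cython propagate exceptions from it. An illustrative round trip in plain Python (contract only; the real functions operate on the C struct):

    # Illustrative contract only; the Cython versions read and write
    # LexemeC struct fields rather than dict items.
    def lexeme_pack(lexeme):
        return dict(lexeme)    # struct fields -> dict

    def lexeme_unpack(lexeme, p):
        lexeme.update(p)       # dict -> struct fields
        return 0               # -1 would signal an error to Cython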

View File

@@ -16,17 +16,15 @@ def read_lang_data(name):
     tokenization = read_tokenization(data_dir)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
-    words = load_resource(data_dir, 'words')
-    probs = load_resource(data_dir, 'probs')
-    clusters = load_resource(data_dir, 'clusters')
-    case_stats = load_resource(data_dir, 'case_stats')
-    tag_stats = load_resource(data_dir, 'tag_stats')
-    return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats
+    lex_loc = path.join(data_dir, 'lexemes.json')
+    if path.exists(lex_loc):
+        with open(lex_loc) as file_:
+            lexemes = ujson.load(file_)
+    else:
+        lexemes = []
+    return tokenization, prefix, suffix, lexemes
-def load_resource(data_dir, name):
-    loc = path.join(data_dir, name + '.json')
-    return json.load(loc) if path.exists(loc) else {}
 def read_prefix(data_dir):
     with utf8open(path.join(data_dir, 'prefix')) as file_:
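With this change, callers of read_lang_data unpack four values, and a missing lexemes.json simply produces an empty lexicon rather than an error. A minimal usage sketch (the 'en' data directory is hypothetical):

    # Hypothetical caller; 'en' stands in for whichever language data
    # directory is installed.
    rules, prefix, suffix, lexemes = read_lang_data('en')
    if not lexemes:
        print('No lexemes.json found; the lexicon will start empty.')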