mirror of https://github.com/explosion/spaCy.git
* Update Lexicon class to expect a list of lexeme dict descriptions
This commit is contained in:
parent
51d75b244b
commit
e40caae51f
|
@ -15,7 +15,7 @@ import re
|
||||||
|
|
||||||
from .util import read_lang_data
|
from .util import read_lang_data
|
||||||
from spacy.tokens import Tokens
|
from spacy.tokens import Tokens
|
||||||
from spacy.lexeme cimport LexemeC, lexeme_init
|
from spacy.lexeme cimport LexemeC, lexeme_init, lexeme_pack, lexeme_unpack
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from cpython.ref cimport Py_INCREF
|
from cpython.ref cimport Py_INCREF
|
||||||
|
@ -25,7 +25,6 @@ from cymem.cymem cimport Pool
|
||||||
from cython.operator cimport preincrement as preinc
|
from cython.operator cimport preincrement as preinc
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
|
|
||||||
|
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from spacy import orth
|
from spacy import orth
|
||||||
from spacy import util
|
from spacy import util
|
||||||
|
@ -69,7 +68,6 @@ cdef enum Views:
|
||||||
View_N
|
View_N
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Assign the flag and view functions by enum value.
|
# Assign the flag and view functions by enum value.
|
||||||
# This is verbose, but it ensures we don't get nasty order sensitivities.
|
# This is verbose, but it ensures we don't get nasty order sensitivities.
|
||||||
STRING_VIEW_FUNCS = [None] * View_N
|
STRING_VIEW_FUNCS = [None] * View_N
|
||||||
|
@ -107,8 +105,6 @@ FLAG_FUNCS[Flag_OftTitle] = orth.oft_case('title', 0.7)
|
||||||
FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)
|
FLAG_FUNCS[Flag_OftUpper] = orth.oft_case('upper', 0.7)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Language:
|
cdef class Language:
|
||||||
"""Base class for language-specific tokenizers.
|
"""Base class for language-specific tokenizers.
|
||||||
|
|
||||||
|
@ -127,23 +123,19 @@ cdef class Language:
|
||||||
fl_is_digit = Flag_IsDigit
|
fl_is_digit = Flag_IsDigit
|
||||||
v_shape = View_WordShape
|
v_shape = View_WordShape
|
||||||
|
|
||||||
def __cinit__(self, name, user_string_features, user_flag_features):
|
def __init__(self, name, user_string_features, user_flag_features):
|
||||||
self.name = name
|
self.name = name
|
||||||
self._mem = Pool()
|
self._mem = Pool()
|
||||||
self.cache = PreshMap(2 ** 25)
|
self.cache = PreshMap(2 ** 25)
|
||||||
self.specials = PreshMap(2 ** 16)
|
self.specials = PreshMap(2 ** 16)
|
||||||
lang_data = util.read_lang_data(name)
|
rules, prefix, suffix, lexemes = util.read_lang_data(name)
|
||||||
rules, prefix, suffix, words, probs, clusters, case_stats, tag_stats = lang_data
|
|
||||||
self.prefix_re = re.compile(prefix)
|
self.prefix_re = re.compile(prefix)
|
||||||
self.suffix_re = re.compile(suffix)
|
self.suffix_re = re.compile(suffix)
|
||||||
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
|
self.lexicon = Lexicon(lexemes,
|
||||||
STRING_VIEW_FUNCS + user_string_features,
|
STRING_VIEW_FUNCS + user_string_features,
|
||||||
FLAG_FUNCS + user_flag_features)
|
FLAG_FUNCS + user_flag_features)
|
||||||
self._load_special_tokenization(rules)
|
self._load_special_tokenization(rules)
|
||||||
|
|
||||||
def __dealloc__(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
property nr_types:
|
property nr_types:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""Return the number of lexical types in the vocabulary"""
|
"""Return the number of lexical types in the vocabulary"""
|
||||||
|
@ -347,27 +339,20 @@ cdef class Language:
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexicon:
|
cdef class Lexicon:
|
||||||
def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
|
def __cinit__(self, lexemes, string_features, flag_features):
|
||||||
string_features, flag_features):
|
|
||||||
self._mem = Pool()
|
self._mem = Pool()
|
||||||
self._flag_features = flag_features
|
self._flag_features = flag_features
|
||||||
self._string_features = string_features
|
self._string_features = string_features
|
||||||
self._dict = PreshMap(2 ** 20)
|
self._dict = PreshMap(2 ** 20)
|
||||||
self.size = 0
|
self.size = 0
|
||||||
cdef String string
|
cdef String string
|
||||||
for uni_string in words:
|
cdef dict lexeme_dict
|
||||||
prob = probs.get(uni_string, 0.0)
|
cdef LexemeC* lexeme
|
||||||
cluster = clusters.get(uni_string, 0.0)
|
for lexeme_dict in lexemes:
|
||||||
cases = case_stats.get(uni_string, {})
|
string_from_unicode(&string, lexeme_dict['string'])
|
||||||
tags = tag_stats.get(uni_string, {})
|
lexeme = <LexemeC*>self._mem.alloc(1, sizeof(LexemeC))
|
||||||
views = [string_view(uni_string, prob, cluster, cases, tags)
|
lexeme.views = <char**>self._mem.alloc(len(string_features), sizeof(char*))
|
||||||
for string_view in self._string_features]
|
lexeme_unpack(lexeme, lexeme_dict)
|
||||||
flags = set()
|
|
||||||
for i, flag_feature in enumerate(self._flag_features):
|
|
||||||
if flag_feature(uni_string, prob, cluster, cases, tags):
|
|
||||||
flags.add(i)
|
|
||||||
lexeme = lexeme_init(self._mem, self.size, uni_string, prob, cluster, views, flags)
|
|
||||||
string_from_unicode(&string, uni_string)
|
|
||||||
self._dict.set(string.key, lexeme)
|
self._dict.set(string.key, lexeme)
|
||||||
self.size += 1
|
self.size += 1
|
||||||
|
|
||||||
|
|
|
@ -22,3 +22,4 @@ cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id)
|
||||||
|
|
||||||
|
|
||||||
cdef dict lexeme_pack(LexemeC* lexeme)
|
cdef dict lexeme_pack(LexemeC* lexeme)
|
||||||
|
cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1
|
||||||
|
|
|
@ -16,17 +16,15 @@ def read_lang_data(name):
|
||||||
tokenization = read_tokenization(data_dir)
|
tokenization = read_tokenization(data_dir)
|
||||||
prefix = read_prefix(data_dir)
|
prefix = read_prefix(data_dir)
|
||||||
suffix = read_suffix(data_dir)
|
suffix = read_suffix(data_dir)
|
||||||
words = load_resource(data_dir, 'words')
|
|
||||||
probs = load_resource(data_dir, 'probs')
|
|
||||||
clusters = load_resource(data_dir, 'clusters')
|
|
||||||
case_stats = load_resource(data_dir, 'case_stats')
|
|
||||||
tag_stats = load_resource(data_dir, 'tag_stats')
|
|
||||||
return tokenization, prefix, suffix, words, probs, clusters, case_stats, tag_stats
|
|
||||||
|
|
||||||
|
lex_loc = path.join(data_dir, 'lexemes.json')
|
||||||
|
if path.exists(lex_loc):
|
||||||
|
with open(lex_loc) as file_:
|
||||||
|
lexemes = ujson.load(file_)
|
||||||
|
else:
|
||||||
|
lexemes = []
|
||||||
|
return tokenization, prefix, suffix, lexemes
|
||||||
|
|
||||||
def load_resource(data_dir, name):
|
|
||||||
loc = path.join(data_dir, name + '.json')
|
|
||||||
return json.load(loc) if path.exists(loc) else {}
|
|
||||||
|
|
||||||
def read_prefix(data_dir):
|
def read_prefix(data_dir):
|
||||||
with utf8open(path.join(data_dir, 'prefix')) as file_:
|
with utf8open(path.join(data_dir, 'prefix')) as file_:
|
||||||
|
|
Loading…
Reference in New Issue