* Move special tokenization into its own lookup table, away from the cache.

Matthew Honnibal 2014-09-12 19:43:14 +02:00
parent 985bc68327
commit 9298e36b36
3 changed files with 14 additions and 5 deletions
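
For orientation, here is a minimal pure-Python sketch of the lookup order this commit introduces. The real code below is Cython: it keys both tables by 64-bit hashes of raw Py_UNICODE buffers and stores LexemeC* addresses rather than strings. The `split_one` and `lexicon_get` callables here are hypothetical stand-ins for `Language._split_one` and `Lexicon.get`.

    class TokenizerSketch:
        def __init__(self, split_one, lexicon_get):
            self.specials = {}   # rule-based exceptions, loaded once from lang data
            self.cache = {}      # memoised results of ordinary splitting
            self._split_one = split_one      # stand-in for Language._split_one
            self._lexicon_get = lexicon_get  # stand-in for Lexicon.get

        def tokenize_chunk(self, chunk):
            # Specials are consulted before the cache, so an exception rule
            # can never be shadowed by a previously cached ordinary analysis.
            if chunk in self.specials:
                return self.specials[chunk]
            if chunk in self.cache:
                return self.cache[chunk]
            lexemes = []
            start = 0
            while start < len(chunk):
                split = self._split_one(chunk[start:])
                piece = chunk[start:start + split]
                if piece in self.specials:   # pieces are checked against specials too
                    lexemes.extend(self.specials[piece])
                else:
                    lexemes.append(self._lexicon_get(piece))
                start += split
            # The assembled result is cached under the whole chunk's key.
            self.cache[chunk] = lexemes
            return lexemes

A usage example, with an illustrative exception rule:

    tok = TokenizerSketch(lambda s: len(s), lambda s: s)
    tok.specials["don't"] = ["do", "n't"]
    assert tok.tokenize_chunk("don't") == ["do", "n't"]
    assert tok.tokenize_chunk("hello") == ["hello"]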

View File

@@ -237,6 +237,7 @@ cdef class English(Language):
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
+        self.specials = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,

View File

@@ -72,6 +72,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef unicode name
     cdef dict cache
+    cdef dict specials
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class

View File

@@ -41,6 +41,7 @@ cdef class Language:
         string_features = []
         self.name = name
         self.cache = {}
+        self.specials = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -88,6 +89,7 @@ cdef class Language:
         cdef size_t i = 0
         cdef Py_UNICODE* characters = string
         cdef Py_UNICODE c
+        assert Py_UNICODE_ISSPACE(' ') == 1
         for i in range(length):
             c = characters[i]
             if Py_UNICODE_ISSPACE(c) == 1:
@@ -103,6 +105,11 @@ cdef class Language:
         cdef list lexemes
         cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        if hashed in self.specials:
+            for lex_addr in self.specials[hashed]:
+                tokens.push_back(<LexemeC*>lex_addr)
+            return 0
+
         if hashed in self.cache:
             for lex_addr in self.cache[hashed]:
                 tokens.push_back(<LexemeC*>lex_addr)
@@ -113,16 +120,16 @@ cdef class Language:
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
-            if hashed in self.cache:
-                lexemes.extend(self.cache[hashed])
+            piece_hash = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if piece_hash in self.specials:
+                lexemes.extend(self.specials[piece_hash])
             else:
                 lexeme = <LexemeC*>self.lexicon.get(&characters[start], split)
                 lexemes.append(<size_t>lexeme)
             start += split
         for lex_addr in lexemes:
             tokens.push_back(<LexemeC*>lex_addr)
-        #self.cache[hashed] = lexemes
+        self.cache[hashed] = lexemes

     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
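
A detail worth noting in the hunk above: the old loop reused `hashed` for each piece's hash, so the final `self.cache[hashed] = lexemes` write (previously commented out) would have keyed the whole chunk's result under the last piece's hash. Renaming the loop variable to `piece_hash` leaves `hashed` holding the whole-chunk hash, which is what makes it safe to re-enable the cache write here.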
@@ -146,7 +153,7 @@ cdef class Language:
         lexemes = []
         for substring in substrings:
             lexemes.append(self.lexicon.get(<Py_UNICODE*>substring, len(substring)))
-        self.cache[hashed] = lexemes
+        self.specials[hashed] = lexemes


 cdef class Lexicon:
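
The hunk above is where the specials table gets populated, from the `rules` unpacked out of `read_lang_data` in the constructor; the key change is that the exception analyses now land in `self.specials` instead of `self.cache`. A rough pure-Python equivalent, reusing the `TokenizerSketch` from earlier, with the rules format assumed rather than taken from this diff:

    def load_specials(tok, rules):
        # Assumed rules format: {"don't": ["do", "n't"], "won't": ["wo", "n't"], ...}
        for chunk, substrings in rules.items():
            lexemes = [tok._lexicon_get(s) for s in substrings]
            tok.specials[chunk] = lexemes   # was the shared cache before this commit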