From 9298e36b363e676f962661a002cd8104c0940f45 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 12 Sep 2014 19:43:14 +0200
Subject: [PATCH] * Move special tokenization into its own lookup table, away
 from the cache.

---
 spacy/en.pyx   |  1 +
 spacy/lang.pxd |  1 +
 spacy/lang.pyx | 17 ++++++++++++-----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 6300fda25..9d9f4f596 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -237,6 +237,7 @@ cdef class English(Language):
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
         self.cache = {}
+        self.specials = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index a498e6e0f..094b3d580 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -72,6 +72,7 @@ cdef class Lexicon:
 cdef class Language:
     cdef unicode name
     cdef dict cache
+    cdef dict specials
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index b11e78921..9e57c63af 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -41,6 +41,7 @@ cdef class Language:
         string_features = []
         self.name = name
         self.cache = {}
+        self.specials = {}
         lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -88,6 +89,7 @@ cdef class Language:
         cdef size_t i = 0
         cdef Py_UNICODE* characters = string
         cdef Py_UNICODE c
+        assert Py_UNICODE_ISSPACE(' ') == 1
         for i in range(length):
             c = characters[i]
             if Py_UNICODE_ISSPACE(c) == 1:
@@ -103,6 +105,11 @@ cdef class Language:
         cdef list lexemes
         cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
+        if hashed in self.specials:
+            for lex_addr in self.specials[hashed]:
+                tokens.push_back(lex_addr)
+            return 0
+
         if hashed in self.cache:
             for lex_addr in self.cache[hashed]:
                 tokens.push_back(lex_addr)
@@ -113,16 +120,16 @@ cdef class Language:
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
-            if hashed in self.cache:
-                lexemes.extend(self.cache[hashed])
+            piece_hash = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if piece_hash in self.specials:
+                lexemes.extend(self.specials[piece_hash])
             else:
                 lexeme = self.lexicon.get(&characters[start], split)
                 lexemes.append(lexeme)
             start += split
         for lex_addr in lexemes:
             tokens.push_back(lex_addr)
-        #self.cache[hashed] = lexemes
+        self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -146,7 +153,7 @@ cdef class Language:
         lexemes = []
         for substring in substrings:
            lexemes.append(self.lexicon.get(substring, len(substring)))
-        self.cache[hashed] = lexemes
+        self.specials[hashed] = lexemes
 
 
 cdef class Lexicon:
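
Note on the control flow this patch introduces: Language._tokenize now consults
three places in order: the new specials table, then the cache, then the lexicon
piece by piece via _split_one. With special-case rules moved out of the cache,
the previously commented-out write-back (self.cache[hashed] = lexemes) can be
safely re-enabled. The sketch below is a minimal pure-Python rendering of that
flow, not the Cython implementation: it keys both tables by string rather than
by the hash64 of the character span, and tokenize_span, lexicon_get and
split_one are hypothetical stand-ins for _tokenize, Lexicon.get and _split_one.

    def tokenize_span(span, specials, cache, lexicon_get, split_one):
        """Tokenize one whitespace-delimited span of text."""
        # 1. Special-case tokenizations (e.g. "don't" -> ["do", "n't"]) win
        #    outright; they live in their own table and are never overwritten.
        if span in specials:
            return list(specials[span])
        # 2. Previously computed general results come from the cache.
        if span in cache:
            return list(cache[span])
        # 3. Otherwise split the span piece by piece, checking each piece
        #    against the specials table, then memoize the whole result.
        lexemes = []
        start = 0
        while start < len(span):
            n = split_one(span[start:])  # assumed to return >= 1, like _split_one
            piece = span[start:start + n]
            if piece in specials:
                lexemes.extend(specials[piece])
            else:
                lexemes.append(lexicon_get(piece))
            start += n
        cache[span] = lexemes  # safe now that specials live in their own table
        return list(lexemes)

For example, with specials = {"don't": ["do", "n't"]}, lexicon_get = str and
split_one = len (mirroring the patch's _split_one, which returns the full
length), tokenize_span("don't", ...) yields ["do", "n't"] without touching the
cache, while an ordinary span is computed once and then memoized. Note also the
rename from hashed to piece_hash inside the splitting loop: in the old code the
per-piece hash overwrote the whole-span hash, so re-enabling the commented-out
write-back would have cached the result under the last piece's key; after the
rename, hashed still holds the whole span's hash when self.cache[hashed] =
lexemes runs.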