From 43743a5d631c9acb07c4143dd1214b4d26f77ba0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 14 Oct 2014 18:22:41 +1100 Subject: [PATCH] * Work on efficiency --- spacy/lang.pxd | 2 +- spacy/lang.pyx | 45 +++++++++++++++++++++++++-------------------- spacy/tokens.pxd | 6 +++--- spacy/tokens.pyx | 17 ++++++++++++++--- 4 files changed, 43 insertions(+), 27 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index a3cb7c281..a1d78363c 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -51,7 +51,7 @@ cdef class Language: cpdef Tokens tokenize(self, unicode text) - cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1 + cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except NULL cdef int _attach_tokens(self, Tokens tokens, int idx, String* string, diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 2ab744402..07bdf6e9b 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -70,35 +70,40 @@ cdef class Language: cdef int start = 0 cdef int i = 0 cdef Py_UNICODE* chars = string + cdef String span for i in range(length): if Py_UNICODE_ISSPACE(chars[i]) == 1: if start < i: - self._tokenize(tokens, chars, start, i) + string_slice(&span, chars, start, i) + lexemes = self.cache.get(span.key) + if lexemes != NULL: + tokens.extend(start, lexemes, 0) + else: + self._tokenize(tokens, &span, start, i) start = i + 1 i += 1 if start < i: - self._tokenize(tokens, chars, start, i) + string_slice(&span, chars, start, i) + lexemes = self.cache.get(span.key) + if lexemes != NULL: + tokens.extend(start, lexemes, 0) + else: + self._tokenize(tokens, &span, start, i) return tokens - cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1: - cdef String span + cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef uint64_t orig_key cdef int orig_size - string_slice(&span, chars, start, end) - lexemes = self.cache.get(span.key) - if lexemes != NULL: - tokens.extend(start, lexemes, 0) - else: - orig_key = span.key - orig_size = tokens.lex.size() - span = self._split_affixes(&span, &prefixes, &suffixes)[0] - self._attach_tokens(tokens, start, &span, &prefixes, &suffixes) - self._save_cached(&tokens.lex, orig_key, orig_size) + orig_key = span.key + orig_size = tokens.lex.size() + self._split_affixes(span, &prefixes, &suffixes) + self._attach_tokens(tokens, start, span, &prefixes, &suffixes) + self._save_cached(tokens.lex, orig_key, orig_size) cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes) except NULL: + vector[LexemeC*] *suffixes) except NULL: cdef size_t i cdef String prefix cdef String suffix @@ -113,7 +118,7 @@ cdef class Language: string_slice(&minus_pre, string.chars, pre_len, string.n) # Check whether we've hit a special-case if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL: - string = &minus_pre + string[0] = minus_pre prefixes.push_back(self.lexicon.get(&prefix)) break suf_len = self._find_suffix(string.chars, string.n) @@ -122,7 +127,7 @@ cdef class Language: string_slice(&minus_suf, string.chars, 0, string.n - suf_len) # Check whether we've hit a special-case if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL: - string = &minus_suf + string[0] = minus_suf suffixes.push_back(self.lexicon.get(&suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= string.n: @@ -130,10 +135,10 @@ cdef class Language: prefixes.push_back(self.lexicon.get(&prefix)) suffixes.push_back(self.lexicon.get(&suffix)) elif pre_len: - string = &minus_pre + string[0] = minus_pre prefixes.push_back(self.lexicon.get(&prefix)) elif suf_len: - string = &minus_suf + string[0] = minus_suf suffixes.push_back(self.lexicon.get(&suffix)) if self.specials.get(string.key): break @@ -271,7 +276,7 @@ cdef void string_from_unicode(String* s, unicode uni): string_slice(s, c_uni, 0, len(uni)) -cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil: +cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil: s.chars = &chars[start] s.n = end - start s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 97adea956..7c1f77644 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -3,9 +3,9 @@ from libcpp.vector cimport vector cdef class Tokens: - cdef vector[LexemeC*] lex - cdef vector[int] idx - cdef vector[int] pos + cdef vector[LexemeC*] *lex + cdef vector[int] *idx + cdef vector[int] *pos cdef int extend(self, int i, LexemeC** lexemes, int n) except -1 cdef int push_back(self, int i, LexemeC* lexeme) except -1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 783aa5c18..0a3a075b1 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -25,10 +25,18 @@ cdef class Tokens: """ def __cinit__(self, string_length=0): size = int(string_length / 3) if string_length >= 3 else 1 + self.lex = new vector[LexemeC*]() + self.idx = new vector[int]() + self.pos = new vector[int]() self.lex.reserve(size) self.idx.reserve(size) self.pos.reserve(size) + def __dealloc__(self): + del self.lex + del self.idx + del self.pos + def __getitem__(self, i): return Lexeme(self.lex.at(i)) @@ -38,7 +46,6 @@ cdef class Tokens: cdef int push_back(self, int idx, LexemeC* lexeme) except -1: self.lex.push_back(lexeme) self.idx.push_back(idx) - self.pos.push_back(0) return idx + lexeme.ints[LexInt_length] cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1: @@ -48,11 +55,15 @@ cdef class Tokens: elif n == 0: i = 0 while lexemes[i] != NULL: - idx = self.push_back(idx, lexemes[i]) + self.lex.push_back(lexemes[i]) + self.idx.push_back(idx) + idx += lexemes[i].ints[LexInt_length] i += 1 else: for i in range(n): - idx = self.push_back(idx, lexemes[i]) + self.lex.push_back(lexemes[i]) + self.idx.push_back(idx) + idx += lexemes[i].ints[LexInt_length] return idx cpdef int id(self, size_t i) except -1: