mirror of https://github.com/explosion/spaCy.git
* Work on efficiency
This commit is contained in:
parent
6fb42c4919
commit
43743a5d63
|
@ -51,7 +51,7 @@ cdef class Language:
|
|||
|
||||
cpdef Tokens tokenize(self, unicode text)
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
|
||||
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes) except NULL
|
||||
cdef int _attach_tokens(self, Tokens tokens, int idx, String* string,
|
||||
|
|
|
@ -70,32 +70,37 @@ cdef class Language:
|
|||
cdef int start = 0
|
||||
cdef int i = 0
|
||||
cdef Py_UNICODE* chars = string
|
||||
cdef String span
|
||||
for i in range(length):
|
||||
if Py_UNICODE_ISSPACE(chars[i]) == 1:
|
||||
if start < i:
|
||||
self._tokenize(tokens, chars, start, i)
|
||||
start = i + 1
|
||||
i += 1
|
||||
if start < i:
|
||||
self._tokenize(tokens, chars, start, i)
|
||||
return tokens
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, Py_UNICODE* chars, int start, int end) except -1:
|
||||
cdef String span
|
||||
cdef vector[LexemeC*] prefixes
|
||||
cdef vector[LexemeC*] suffixes
|
||||
cdef uint64_t orig_key
|
||||
cdef int orig_size
|
||||
string_slice(&span, chars, start, end)
|
||||
string_slice(&span, chars, start, i)
|
||||
lexemes = <LexemeC**>self.cache.get(span.key)
|
||||
if lexemes != NULL:
|
||||
tokens.extend(start, lexemes, 0)
|
||||
else:
|
||||
self._tokenize(tokens, &span, start, i)
|
||||
start = i + 1
|
||||
i += 1
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
lexemes = <LexemeC**>self.cache.get(span.key)
|
||||
if lexemes != NULL:
|
||||
tokens.extend(start, lexemes, 0)
|
||||
else:
|
||||
self._tokenize(tokens, &span, start, i)
|
||||
return tokens
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
||||
cdef vector[LexemeC*] prefixes
|
||||
cdef vector[LexemeC*] suffixes
|
||||
cdef uint64_t orig_key
|
||||
cdef int orig_size
|
||||
orig_key = span.key
|
||||
orig_size = tokens.lex.size()
|
||||
span = self._split_affixes(&span, &prefixes, &suffixes)[0]
|
||||
self._attach_tokens(tokens, start, &span, &prefixes, &suffixes)
|
||||
self._save_cached(&tokens.lex, orig_key, orig_size)
|
||||
self._split_affixes(span, &prefixes, &suffixes)
|
||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||
self._save_cached(tokens.lex, orig_key, orig_size)
|
||||
|
||||
cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes,
|
||||
vector[LexemeC*] *suffixes) except NULL:
|
||||
|
@ -113,7 +118,7 @@ cdef class Language:
|
|||
string_slice(&minus_pre, string.chars, pre_len, string.n)
|
||||
# Check whether we've hit a special-case
|
||||
if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
|
||||
string = &minus_pre
|
||||
string[0] = minus_pre
|
||||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
break
|
||||
suf_len = self._find_suffix(string.chars, string.n)
|
||||
|
@ -122,7 +127,7 @@ cdef class Language:
|
|||
string_slice(&minus_suf, string.chars, 0, string.n - suf_len)
|
||||
# Check whether we've hit a special-case
|
||||
if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
|
||||
string = &minus_suf
|
||||
string[0] = minus_suf
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
break
|
||||
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
|
||||
|
@ -130,10 +135,10 @@ cdef class Language:
|
|||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
elif pre_len:
|
||||
string = &minus_pre
|
||||
string[0] = minus_pre
|
||||
prefixes.push_back(self.lexicon.get(&prefix))
|
||||
elif suf_len:
|
||||
string = &minus_suf
|
||||
string[0] = minus_suf
|
||||
suffixes.push_back(self.lexicon.get(&suffix))
|
||||
if self.specials.get(string.key):
|
||||
break
|
||||
|
@ -271,7 +276,7 @@ cdef void string_from_unicode(String* s, unicode uni):
|
|||
string_slice(s, c_uni, 0, len(uni))
|
||||
|
||||
|
||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, size_t start, size_t end) nogil:
|
||||
cdef inline void string_slice(String* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||
s.chars = &chars[start]
|
||||
s.n = end - start
|
||||
s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
|
||||
|
|
|
@ -3,9 +3,9 @@ from libcpp.vector cimport vector
|
|||
|
||||
|
||||
cdef class Tokens:
|
||||
cdef vector[LexemeC*] lex
|
||||
cdef vector[int] idx
|
||||
cdef vector[int] pos
|
||||
cdef vector[LexemeC*] *lex
|
||||
cdef vector[int] *idx
|
||||
cdef vector[int] *pos
|
||||
|
||||
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
|
||||
cdef int push_back(self, int i, LexemeC* lexeme) except -1
|
||||
|
|
|
@ -25,10 +25,18 @@ cdef class Tokens:
|
|||
"""
|
||||
def __cinit__(self, string_length=0):
|
||||
size = int(string_length / 3) if string_length >= 3 else 1
|
||||
self.lex = new vector[LexemeC*]()
|
||||
self.idx = new vector[int]()
|
||||
self.pos = new vector[int]()
|
||||
self.lex.reserve(size)
|
||||
self.idx.reserve(size)
|
||||
self.pos.reserve(size)
|
||||
|
||||
def __dealloc__(self):
|
||||
del self.lex
|
||||
del self.idx
|
||||
del self.pos
|
||||
|
||||
def __getitem__(self, i):
|
||||
return Lexeme(<size_t>self.lex.at(i))
|
||||
|
||||
|
@ -38,7 +46,6 @@ cdef class Tokens:
|
|||
cdef int push_back(self, int idx, LexemeC* lexeme) except -1:
|
||||
self.lex.push_back(lexeme)
|
||||
self.idx.push_back(idx)
|
||||
self.pos.push_back(0)
|
||||
return idx + lexeme.ints[<int>LexInt_length]
|
||||
|
||||
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
|
||||
|
@ -48,11 +55,15 @@ cdef class Tokens:
|
|||
elif n == 0:
|
||||
i = 0
|
||||
while lexemes[i] != NULL:
|
||||
idx = self.push_back(idx, lexemes[i])
|
||||
self.lex.push_back(lexemes[i])
|
||||
self.idx.push_back(idx)
|
||||
idx += lexemes[i].ints[<int>LexInt_length]
|
||||
i += 1
|
||||
else:
|
||||
for i in range(n):
|
||||
idx = self.push_back(idx, lexemes[i])
|
||||
self.lex.push_back(lexemes[i])
|
||||
self.idx.push_back(idx)
|
||||
idx += lexemes[i].ints[<int>LexInt_length]
|
||||
return idx
|
||||
|
||||
cpdef int id(self, size_t i) except -1:
|
||||
|
|
Loading…
Reference in New Issue