diff --git a/spacy/lang.pxd b/spacy/lang.pxd index a1d78363c..82078ff12 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -59,5 +59,5 @@ cdef class Language: cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 - cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1 + cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1 diff --git a/spacy/lang.pyx b/spacy/lang.pyx index fb9ae597e..0241413d0 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -101,10 +101,10 @@ cdef class Language: cdef uint64_t orig_key cdef int orig_size orig_key = span.key - orig_size = tokens.lex.size() + orig_size = tokens.length self._split_affixes(span, &prefixes, &suffixes) self._attach_tokens(tokens, start, span, &prefixes, &suffixes) - self._save_cached(tokens.lex, orig_key, orig_size) + self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) cdef String* _split_affixes(self, String* string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except NULL: @@ -177,12 +177,11 @@ cdef class Language: idx = tokens.push_back(idx, deref(it)) preinc(it) - cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1: - assert tokens.size() > n - lexemes = self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**)) - cdef size_t i, j - for i, j in enumerate(range(n, tokens.size())): - lexemes[i] = tokens.at(j) + cdef int _save_cached(self, LexemeC** tokens, uint64_t key, int n) except -1: + lexemes = self._mem.alloc(n + 1, sizeof(LexemeC**)) + cdef int i + for i in range(n): + lexemes[i] = tokens[i] lexemes[i + 1] = NULL self.cache.set(key, lexemes) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 1a781d59c..fe68a96ea 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -73,6 +73,9 @@ cdef struct LexemeC: flag_t dist_flags +cdef LexemeC EMPTY_LEXEME + + cpdef dict get_lexeme_dict(size_t i, unicode string) cdef char* intern_and_encode(unicode string, size_t* length) except NULL diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 0d5f0a0f5..1f1d793ad 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,10 +1,13 @@ from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool +from libc.string cimport memset + import orth OOV_DIST_FLAGS = 0 +memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) cpdef dict get_lexeme_dict(size_t i, unicode string): ints = [None for _ in range(LexInt_N)] diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index bc5c5fe1d..b39324fe2 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,21 +1,24 @@ +from cymem.cymem cimport Pool + from spacy.lexeme cimport LexemeC -from libcpp.vector cimport vector from thinc.typedefs cimport atom_t cdef class Tokens: - cdef vector[LexemeC*] *lex - cdef vector[int] *idx - cdef vector[int] *pos + cdef Pool mem + + cdef LexemeC** _lex_ptr + cdef int* _idx_ptr + cdef int* _pos_ptr + cdef LexemeC** lex + cdef int* idx + cdef int* pos + + cdef int length + cdef int max_length cdef int extend(self, int i, LexemeC** lexemes, int n) except -1 cdef int push_back(self, int i, LexemeC* lexeme) except -1 - cdef int int_array(self, atom_t* atoms, int i, int* indices, int n_idx, - int* features, int n_feat) except -1 - cdef int string_array(self, atom_t* atoms, int i, int* indices, int n_idx, - int* features, int n_feat) except -1 - cdef int bool_array(self, atom_t* atoms, int i, int* indices, int n_idx, - int* features, int n_feat) except -1 cpdef int id(self, size_t i) except -1 cpdef float prob(self, size_t i) except 1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 58c26d7f9..7b81f6cf1 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -6,6 +6,14 @@ cimport numpy cimport cython import numpy +DEF PADDING = 5 + +cdef int bounds_check(int i, int length, int padding) except -1: + if (i + padding) < 0: + raise IndexError + if (i - padding) >= length: + raise IndexError + cdef class Tokens: """A sequence of references to Lexeme objects. @@ -26,71 +34,58 @@ cdef class Tokens: >>> tokens.can_noun(1) True """ - def __cinit__(self, string_length=0): - size = int(string_length / 3) if string_length >= 3 else 1 - self.lex = new vector[LexemeC*]() - self.idx = new vector[int]() - self.pos = new vector[int]() - self.lex.reserve(size) - self.idx.reserve(size) - self.pos.reserve(size) + def __init__(self, string_length=0): + if string_length >= 3: + size = int(string_length / 3.0) + else: + size = 5 + self.mem = Pool() + # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds + # However, we need to remember the true starting places, so that we can + # realloc. + self._lex_ptr = self.mem.alloc(size + (PADDING*2), sizeof(LexemeC*)) + self._idx_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) + self._pos_ptr = self.mem.alloc(size + (PADDING*2), sizeof(int)) + self.lex = self._lex_ptr + self.idx = self._idx_ptr + self.pos = self._pos_ptr + for i in range(PADDING): + self.lex[i] = &EMPTY_LEXEME + for i in range(size, PADDING): + self.lex[i] = &EMPTY_LEXEME + self.lex += PADDING + self.idx += PADDING + self.pos += PADDING - def __dealloc__(self): - del self.lex - del self.idx - del self.pos + self.max_length = size + self.length = 0 def __getitem__(self, i): - if i >= self.lex.size(): - raise IndexError - return Lexeme(self.lex.at(i)) + bounds_check(i, self.length, PADDING) + return Lexeme(self.lex[i]) def __len__(self): - return self.lex.size() + return self.length cdef int push_back(self, int idx, LexemeC* lexeme) except -1: - self.lex.push_back(lexeme) - self.idx.push_back(idx) + if self.length == self.max_length: + self._realloc(self.length * 2) + self.lex[self.length] = lexeme + self.idx[self.length] = idx + self.pos[self.length] = 0 + self.length += 1 return idx + lexeme.ints[LexInt_length] - cdef int int_array(self, atom_t* output, int i, int* indices, int n_idx, - int* features, int n_feat): - cdef int feat_id, idx - cdef int length = self.lex.size() - for feat_id in features[:n_feat]: - for idx in indices[:n_idx]: - if idx < 0 or idx >= length: - output[i] = 0 - else: - output[i] = self.lex[0][idx].ints[feat_id] - i += 1 - return i + def _realloc(self, new_size): + self.max_length = new_size + n = new_size + (PADDING * 2) + self._lex_ptr = self.mem.realloc(self._lex_ptr, n * sizeof(LexemeC*)) + self._idx_ptr = self.mem.realloc(self._idx_ptr, n * sizeof(int)) + self._pos_ptr = self.mem.realloc(self._pos_ptr, n * sizeof(int)) + self.lex = self._lex_ptr + PADDING + self.idx = self._idx_ptr + PADDING + self.pos = self._pos_ptr + PADDING - cdef int string_array(self, atom_t* output, int i, int* indices, int n_idx, - int* features, int n_feat): - cdef int feat_id, idx - cdef int length = self.lex.size() - for feat_id in features[:n_feat]: - for idx in indices[:n_idx]: - if idx < 0 or idx >= length: - output[i] = 0 - else: - output[i] = self.lex[0][idx].strings[feat_id] - i += 1 - return i - - cdef int bool_array(self, atom_t* output, int i, int* indices, int n_idx, - int* features, int n_feat): - cdef int feat_id, idx - cdef int length = self.lex.size() - for feat_id in features[:n_feat]: - for idx in indices[:n_idx]: - if idx < 0 or idx >= length: - output[i] = 0 - else: - output[i] = lexeme_check_dist_flag(self.lex[0][idx], feat_id) - i += 1 - return i cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1: cdef int i @@ -99,131 +94,161 @@ cdef class Tokens: elif n == 0: i = 0 while lexemes[i] != NULL: - self.lex.push_back(lexemes[i]) - self.idx.push_back(idx) - idx += lexemes[i].ints[LexInt_length] + idx = self.push_back(idx, lexemes[i]) i += 1 else: for i in range(n): - self.lex.push_back(lexemes[i]) - self.idx.push_back(idx) - idx += lexemes[i].ints[LexInt_length] + idx = self.push_back(idx, lexemes[i]) return idx cpdef int id(self, size_t i) except -1: - return self.lex.at(i).ints[LexInt_id] + bounds_check(i, self.length, PADDING) + return self.lex[i].ints[LexInt_id] cpdef float prob(self, size_t i) except 1: - return self.lex.at(i).floats[LexFloat_prob] + bounds_check(i, self.length, PADDING) + return self.lex[i].floats[LexFloat_prob] cpdef int cluster(self, size_t i) except *: - return self.lex.at(i).ints[LexInt_cluster] + bounds_check(i, self.length, PADDING) + return self.lex[i].ints[LexInt_cluster] cpdef bint check_orth_flag(self, size_t i, size_t flag_id) except *: - return lexeme_check_orth_flag(self.lex.at(i), flag_id) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], flag_id) cpdef bint check_dist_flag(self, size_t i, size_t flag_id) except *: - return lexeme_check_dist_flag(self.lex.at(i), flag_id) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], flag_id) cpdef unicode string_view(self, size_t i, size_t view_id): - return lexeme_get_string(self.lex.at(i), view_id) + bounds_check(i, self.length, PADDING) + return lexeme_get_string(self.lex[i], view_id) # Provide accessor methods for the features supported by the language. # Without these, clients have to use the underlying string_view and check_flag # methods, which requires them to know the IDs. cpdef unicode string(self, size_t i): - if i >= self.lex.size(): - raise IndexError + bounds_check(i, self.length, PADDING) return self.orig(i) cpdef unicode orig(self, size_t i): - cdef bytes utf8_string = self.lex.at(i).strings[LexStr_orig] + bounds_check(i, self.length, PADDING) + cdef bytes utf8_string = self.lex[i].strings[LexStr_orig] cdef unicode string = utf8_string.decode('utf8') return string cpdef unicode norm(self, size_t i): - cdef bytes utf8_string = self.lex.at(i).strings[LexStr_norm] + bounds_check(i, self.length, PADDING) + cdef bytes utf8_string = self.lex[i].strings[LexStr_norm] cdef unicode string = utf8_string.decode('utf8') return string cpdef unicode shape(self, size_t i): - return lexeme_get_string(self.lex.at(i), LexStr_shape) + bounds_check(i, self.length, PADDING) + return lexeme_get_string(self.lex[i], LexStr_shape) cpdef unicode unsparse(self, size_t i): - return lexeme_get_string(self.lex.at(i), LexStr_unsparse) + bounds_check(i, self.length, PADDING) + return lexeme_get_string(self.lex[i], LexStr_unsparse) cpdef unicode asciied(self, size_t i): - return lexeme_get_string(self.lex.at(i), LexStr_asciied) + bounds_check(i, self.length, PADDING) + return lexeme_get_string(self.lex[i], LexStr_asciied) cpdef bint is_alpha(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_alpha) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_alpha) cpdef bint is_ascii(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_ascii) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_ascii) cpdef bint is_digit(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_digit) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_digit) cpdef bint is_lower(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_lower) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_lower) cpdef bint is_punct(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_punct) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_punct) cpdef bint is_space(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_space) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_space) cpdef bint is_title(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_title) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_title) cpdef bint is_upper(self, size_t i) except *: - return lexeme_check_orth_flag(self.lex.at(i), LexOrth_upper) + bounds_check(i, self.length, PADDING) + return lexeme_check_orth_flag(self.lex[i], LexOrth_upper) cpdef bint can_adj(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_adj) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_adj) cpdef bint can_adp(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_adp) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_adp) cpdef bint can_adv(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_adv) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_adv) cpdef bint can_conj(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_conj) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_conj) cpdef bint can_det(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_det) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_det) cpdef bint can_noun(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_noun) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_noun) cpdef bint can_num(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_num) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_num) cpdef bint can_pdt(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_pdt) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_pdt) cpdef bint can_pos(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_pos) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_pos) cpdef bint can_pron(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_pron) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_pron) cpdef bint can_prt(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_prt) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_prt) cpdef bint can_punct(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_punct) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_punct) cpdef bint can_verb(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_verb) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_verb) cpdef bint oft_lower(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_lower) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_lower) cpdef bint oft_title(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_title) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_title) cpdef bint oft_upper(self, size_t i) except *: - return lexeme_check_dist_flag(self.lex.at(i), LexDist_upper) + bounds_check(i, self.length, PADDING) + return lexeme_check_dist_flag(self.lex[i], LexDist_upper)