diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index d27378816..fd4cf6e70 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -6,7 +6,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .utf8string cimport StringStore, UniStr
 
@@ -45,5 +45,5 @@ cdef class Language:
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
-    cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 100b51a98..1fdd683f3 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -18,7 +18,7 @@ from preshed.maps cimport PreshMap
 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport init as lexeme_init
-from .lexeme cimport check_flag, IS_ALPHA
+from .lexeme cimport check_flag
 
 from .utf8string cimport slice_unicode
 
@@ -114,7 +114,7 @@ cdef class Language:
            orig_size = tokens.length
            self._split_affixes(span, &prefixes, &suffixes)
            self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
-           self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
+           self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
 
     cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
                                 vector[const Lexeme*] *suffixes) except NULL:
@@ -189,14 +189,14 @@ cdef class Language:
                 idx = tokens.push_back(idx, deref(it))
                 preinc(it)
 
-    cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1:
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
         cdef int i
         for i in range(n):
-            if tokens[i].id == 1:
+            if tokens[i].lex.id == 1:
                 return 0
         lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
         for i in range(n):
-            lexemes[i] = tokens[i]
+            lexemes[i] = tokens[i].lex
         lexemes[i + 1] = NULL
         self._cache.set(key, lexemes)
 
@@ -255,7 +255,9 @@ cdef class Lexicon:
         self.set_flags = set_flags
 
     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
-        '''Retrieve a pointer to a Lexeme from the lexicon.'''
+        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
+        if necessary, using memory acquired from the given pool. If the pool
+        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
         cdef Lexeme* lex
         lex = <Lexeme*>self._map.get(string.key)
         if lex != NULL:
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 90356b74e..a219c707f 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -9,18 +9,22 @@ from .typedefs cimport flags_t
 from .utf8string cimport StringStore
 
 
+cdef struct TokenC:
+    const Lexeme* lex
+    int idx
+    int pos
+    int sense
+
+
+cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0)
+
+
 cdef class Tokens:
     cdef Pool mem
     cdef StringStore _string_store
 
-    cdef const Lexeme** _lex_ptr
-    cdef int* _idx_ptr
-    cdef int* _pos_ptr
-    cdef int* _ner_ptr
-    cdef const Lexeme** lex
-    cdef int* idx
-    cdef int* pos
-    cdef int* ner
+    cdef TokenC* _data
+    cdef TokenC* data
 
     cdef int length
     cdef int max_length
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index c06a1b4d8..06d3eeb99 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -40,28 +40,18 @@ cdef class Tokens:
         # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
         # However, we need to remember the true starting places, so that we can
         # realloc.
-        self._lex_ptr = <const Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
-        self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
-        self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
-        self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
-        self.lex = self._lex_ptr
-        self.idx = self._idx_ptr
-        self.pos = self._pos_ptr
-        self.ner = self._ner_ptr
+        self._data = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
         cdef int i
         for i in range(size + (PADDING*2)):
-            self.lex[i] = &EMPTY_LEXEME
-        self.lex += PADDING
-        self.idx += PADDING
-        self.pos += PADDING
-        self.ner += PADDING
+            self._data[i] = EMPTY_TOKEN
+        self.data = self._data + PADDING
         self.max_length = size
         self.length = 0
 
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
-                     self.lex[i][0])
+        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
+                     self.data[i].sense, self.data[i].lex[0])
 
     def __iter__(self):
         for i in range(self.length):
@@ -73,10 +63,11 @@ cdef class Tokens:
     cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
-        self.lex[self.length] = lexeme
-        self.idx[self.length] = idx
-        self.pos[self.length] = 0
-        self.ner[self.length] = 0
+        cdef TokenC* t = &self.data[self.length]
+        t.lex = lexeme
+        t.idx = idx
+        t.pos = 0
+        t.sense = 0
         self.length += 1
         return idx + lexeme.length
 
@@ -108,7 +99,7 @@ cdef class Tokens:
         output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
-                output[i, j] = get_attr(self.lex[i], feature)
+                output[i, j] = get_attr(self.data[i].lex, feature)
         return output
 
     def count_by(self, attr_id_t attr_id):
@@ -118,23 +109,18 @@ cdef class Tokens:
         cdef PreshCounter counts = PreshCounter(2 ** 8)
         for i in range(self.length):
-            attr = get_attr(self.lex[i], attr_id)
+            attr = get_attr(self.data[i].lex, attr_id)
             counts.inc(attr, 1)
         return dict(counts)
 
     def _realloc(self, new_size):
         self.max_length = new_size
         n = new_size + (PADDING * 2)
-        self._lex_ptr = <const Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
-        self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
-        self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
-        self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
-        self.lex = self._lex_ptr + PADDING
-        self.idx = self._idx_ptr + PADDING
-        self.pos = self._pos_ptr + PADDING
-        self.ner = self._ner_ptr + PADDING
+        self._data = <TokenC*>self.mem.realloc(self._data, n * sizeof(TokenC))
+        self.data = self._data + PADDING
+        cdef int i
         for i in range(self.length, self.max_length + PADDING):
-            self.lex[i] = &EMPTY_LEXEME
+            self.data[i] = EMPTY_TOKEN
 
 
 @cython.freelist(64)
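
Reviewer note, not part of the patch: the core of this change is replacing four parallel arrays (lex/idx/pos/ner) with a single TokenC array, so allocation, EMPTY_TOKEN padding, and realloc happen once instead of four times in lock-step, and a token's fields can no longer drift out of sync. The sketch below is a hypothetical, stripped-down illustration of that layout and of the padding trick (data = _data + PADDING, so data[i - x] stays in bounds for small negative offsets). The stub Lexeme and the ToyTokens/padding_sketch names are invented for the example; it assumes only cymem's Pool, whose alloc/realloc calls are used exactly as in the patch itself.

    # padding_sketch.pyx -- hypothetical illustration, not repository code.
    from cymem.cymem cimport Pool

    cdef struct Lexeme:          # stub standing in for spacy.lexeme.Lexeme
        int id
        int length

    cdef Lexeme EMPTY_LEXEME = Lexeme(0, 0)

    cdef struct TokenC:          # same shape as the struct added in tokens.pxd
        const Lexeme* lex
        int idx
        int pos
        int sense

    cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0)

    DEF PADDING = 5

    cdef class ToyTokens:
        cdef Pool mem
        cdef TokenC* _data   # true start of the allocation, remembered for realloc
        cdef TokenC* data    # _data + PADDING: data[i - x] stays in bounds for x < PADDING
        cdef int length
        cdef int max_length

        def __init__(self, size=8):
            self.mem = Pool()
            # One allocation where the old code needed four.
            self._data = <TokenC*>self.mem.alloc(size + (PADDING * 2), sizeof(TokenC))
            cdef int i
            for i in range(size + (PADDING * 2)):
                self._data[i] = EMPTY_TOKEN   # padding reads as an empty token
            self.data = self._data + PADDING
            self.max_length = size
            self.length = 0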
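
A second note, also outside the patch: _save_cached still stores plain NULL-terminated Lexeme* arrays in the cache rather than TokenC values. That works because idx is recomputed as cached entries are replayed through push_back (which returns idx + lexeme.length), and pos/sense start at zero, so only the lexeme sequence needs to be kept. A minimal sketch of that save/replay shape, with invented names (save_lexemes, replay) and the same stub Lexeme as above; it is an assumption-laden illustration, not the repository's API:

    # cache_sketch.pyx -- hypothetical illustration, not repository code.
    from cymem.cymem cimport Pool

    cdef struct Lexeme:          # stub standing in for spacy.lexeme.Lexeme
        int id
        int length

    cdef const Lexeme** save_lexemes(Pool mem, const Lexeme** toks, int n) except NULL:
        # n + 1 slots: the extra slot holds the NULL terminator, so readers
        # need no separate length field (mirrors _save_cached in lang.pyx).
        cdef const Lexeme** lexemes = <const Lexeme**>mem.alloc(n + 1, sizeof(Lexeme*))
        cdef int i
        for i in range(n):
            lexemes[i] = toks[i]
        lexemes[n] = NULL
        return lexemes

    cdef int replay(const Lexeme** cached, int idx) except -1:
        # Walk a cached entry, recomputing token offsets from lexeme lengths.
        cdef int i = 0
        while cached[i] != NULL:
            idx += cached[i].length
            i += 1
        return idx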