mirror of https://github.com/explosion/spaCy.git
* Introduce a TokenC struct, to handle token indices, pos tags and sense tags
This commit is contained in:
parent
187372c7f3
commit
1c9253701d
|
@ -6,7 +6,7 @@ from preshed.maps cimport PreshMap
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from .tokens cimport Tokens
|
from .tokens cimport Tokens, TokenC
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .utf8string cimport StringStore, UniStr
|
from .utf8string cimport StringStore, UniStr
|
||||||
|
|
||||||
|
@ -45,5 +45,5 @@ cdef class Language:
|
||||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
||||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
||||||
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
||||||
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1
|
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ from preshed.maps cimport PreshMap
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport init as lexeme_init
|
from .lexeme cimport init as lexeme_init
|
||||||
from .lexeme cimport check_flag, IS_ALPHA
|
from .lexeme cimport check_flag
|
||||||
|
|
||||||
from .utf8string cimport slice_unicode
|
from .utf8string cimport slice_unicode
|
||||||
|
|
||||||
|
@ -114,7 +114,7 @@ cdef class Language:
|
||||||
orig_size = tokens.length
|
orig_size = tokens.length
|
||||||
self._split_affixes(span, &prefixes, &suffixes)
|
self._split_affixes(span, &prefixes, &suffixes)
|
||||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||||
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size)
|
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||||
|
|
||||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
||||||
vector[const Lexeme*] *suffixes) except NULL:
|
vector[const Lexeme*] *suffixes) except NULL:
|
||||||
|
@ -189,14 +189,14 @@ cdef class Language:
|
||||||
idx = tokens.push_back(idx, deref(it))
|
idx = tokens.push_back(idx, deref(it))
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
|
||||||
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1:
|
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
if tokens[i].id == 1:
|
if tokens[i].lex.id == 1:
|
||||||
return 0
|
return 0
|
||||||
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
lexemes[i] = tokens[i]
|
lexemes[i] = tokens[i].lex
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
self._cache.set(key, lexemes)
|
self._cache.set(key, lexemes)
|
||||||
|
|
||||||
|
@ -255,7 +255,9 @@ cdef class Lexicon:
|
||||||
self.set_flags = set_flags
|
self.set_flags = set_flags
|
||||||
|
|
||||||
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
||||||
'''Retrieve a pointer to a Lexeme from the lexicon.'''
|
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
||||||
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
|
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||||
cdef Lexeme* lex
|
cdef Lexeme* lex
|
||||||
lex = <Lexeme*>self._map.get(string.key)
|
lex = <Lexeme*>self._map.get(string.key)
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
|
|
|
@ -9,18 +9,22 @@ from .typedefs cimport flags_t
|
||||||
from .utf8string cimport StringStore
|
from .utf8string cimport StringStore
|
||||||
|
|
||||||
|
|
||||||
|
cdef struct TokenC:
|
||||||
|
const Lexeme* lex
|
||||||
|
int idx
|
||||||
|
int pos
|
||||||
|
int sense
|
||||||
|
|
||||||
|
|
||||||
|
cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0)
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef StringStore _string_store
|
cdef StringStore _string_store
|
||||||
|
|
||||||
cdef const Lexeme** _lex_ptr
|
cdef TokenC* _data
|
||||||
cdef int* _idx_ptr
|
cdef TokenC* data
|
||||||
cdef int* _pos_ptr
|
|
||||||
cdef int* _ner_ptr
|
|
||||||
cdef const Lexeme** lex
|
|
||||||
cdef int* idx
|
|
||||||
cdef int* pos
|
|
||||||
cdef int* ner
|
|
||||||
|
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
|
|
@ -40,28 +40,18 @@ cdef class Tokens:
|
||||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
||||||
# However, we need to remember the true starting places, so that we can
|
# However, we need to remember the true starting places, so that we can
|
||||||
# realloc.
|
# realloc.
|
||||||
self._lex_ptr = <const Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
|
self._data = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||||
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
|
|
||||||
self.lex = self._lex_ptr
|
|
||||||
self.idx = self._idx_ptr
|
|
||||||
self.pos = self._pos_ptr
|
|
||||||
self.ner = self._ner_ptr
|
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(size + (PADDING*2)):
|
for i in range(size + (PADDING*2)):
|
||||||
self.lex[i] = &EMPTY_LEXEME
|
self._data[i] = EMPTY_TOKEN
|
||||||
self.lex += PADDING
|
self.data = self._data + PADDING
|
||||||
self.idx += PADDING
|
|
||||||
self.pos += PADDING
|
|
||||||
self.ner += PADDING
|
|
||||||
self.max_length = size
|
self.max_length = size
|
||||||
self.length = 0
|
self.length = 0
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
|
return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
|
||||||
self.lex[i][0])
|
self.data[i].sense, self.data[i].lex[0])
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
|
@ -73,10 +63,11 @@ cdef class Tokens:
|
||||||
cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
|
cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
self.lex[self.length] = lexeme
|
cdef TokenC* t = &self.data[self.length]
|
||||||
self.idx[self.length] = idx
|
t.lex = lexeme
|
||||||
self.pos[self.length] = 0
|
t.idx = idx
|
||||||
self.ner[self.length] = 0
|
t.pos = 0
|
||||||
|
t.sense = 0
|
||||||
self.length += 1
|
self.length += 1
|
||||||
return idx + lexeme.length
|
return idx + lexeme.length
|
||||||
|
|
||||||
|
@ -108,7 +99,7 @@ cdef class Tokens:
|
||||||
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(attr_ids):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_attr(self.lex[i], feature)
|
output[i, j] = get_attr(self.data[i].lex, feature)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def count_by(self, attr_id_t attr_id):
|
def count_by(self, attr_id_t attr_id):
|
||||||
|
@ -118,23 +109,18 @@ cdef class Tokens:
|
||||||
|
|
||||||
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
attr = get_attr(self.lex[i], attr_id)
|
attr = get_attr(self.data[i].lex, attr_id)
|
||||||
counts.inc(attr, 1)
|
counts.inc(attr, 1)
|
||||||
return dict(counts)
|
return dict(counts)
|
||||||
|
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
self._lex_ptr = <const Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
|
self._data = <TokenC*>self.mem.realloc(self._data, n * sizeof(TokenC))
|
||||||
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
|
self.data = self._data + PADDING
|
||||||
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
|
cdef int i
|
||||||
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
|
|
||||||
self.lex = self._lex_ptr + PADDING
|
|
||||||
self.idx = self._idx_ptr + PADDING
|
|
||||||
self.pos = self._pos_ptr + PADDING
|
|
||||||
self.ner = self._ner_ptr + PADDING
|
|
||||||
for i in range(self.length, self.max_length + PADDING):
|
for i in range(self.length, self.max_length + PADDING):
|
||||||
self.lex[i] = &EMPTY_LEXEME
|
self.data[i] = EMPTY_TOKEN
|
||||||
|
|
||||||
|
|
||||||
@cython.freelist(64)
|
@cython.freelist(64)
|
||||||
|
|
Loading…
Reference in New Issue