* Introduce a TokenC struct to handle token indices, POS tags and sense tags

This commit is contained in:
Matthew Honnibal 2014-12-05 15:56:14 +11:00
parent 187372c7f3
commit 1c9253701d
4 changed files with 38 additions and 46 deletions

View File

@ -6,7 +6,7 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .tokens cimport Tokens from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .utf8string cimport StringStore, UniStr from .utf8string cimport StringStore, UniStr
@ -45,5 +45,5 @@ cdef class Language:
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1 cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1 cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1

View File

@ -18,7 +18,7 @@ from preshed.maps cimport PreshMap
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init from .lexeme cimport init as lexeme_init
from .lexeme cimport check_flag, IS_ALPHA from .lexeme cimport check_flag
from .utf8string cimport slice_unicode from .utf8string cimport slice_unicode
@ -114,7 +114,7 @@ cdef class Language:
orig_size = tokens.length orig_size = tokens.length
self._split_affixes(span, &prefixes, &suffixes) self._split_affixes(span, &prefixes, &suffixes)
self._attach_tokens(tokens, start, span, &prefixes, &suffixes) self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.lex[orig_size], orig_key, tokens.length - orig_size) self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes, cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except NULL: vector[const Lexeme*] *suffixes) except NULL:
@ -189,14 +189,14 @@ cdef class Language:
idx = tokens.push_back(idx, deref(it)) idx = tokens.push_back(idx, deref(it))
preinc(it) preinc(it)
cdef int _save_cached(self, const Lexeme* const* tokens, hash_t key, int n) except -1: cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
cdef int i cdef int i
for i in range(n): for i in range(n):
if tokens[i].id == 1: if tokens[i].lex.id == 1:
return 0 return 0
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**)) lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
for i in range(n): for i in range(n):
lexemes[i] = tokens[i] lexemes[i] = tokens[i].lex
lexemes[i + 1] = NULL lexemes[i + 1] = NULL
self._cache.set(key, lexemes) self._cache.set(key, lexemes)
@ -255,7 +255,9 @@ cdef class Lexicon:
self.set_flags = set_flags self.set_flags = set_flags
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
'''Retrieve a pointer to a Lexeme from the lexicon.''' '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef Lexeme* lex cdef Lexeme* lex
lex = <Lexeme*>self._map.get(string.key) lex = <Lexeme*>self._map.get(string.key)
if lex != NULL: if lex != NULL:

View File

@ -9,18 +9,22 @@ from .typedefs cimport flags_t
from .utf8string cimport StringStore from .utf8string cimport StringStore
cdef struct TokenC:
const Lexeme* lex
int idx
int pos
int sense
cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0)
cdef class Tokens: cdef class Tokens:
cdef Pool mem cdef Pool mem
cdef StringStore _string_store cdef StringStore _string_store
cdef const Lexeme** _lex_ptr cdef TokenC* _data
cdef int* _idx_ptr cdef TokenC* data
cdef int* _pos_ptr
cdef int* _ner_ptr
cdef const Lexeme** lex
cdef int* idx
cdef int* pos
cdef int* ner
cdef int length cdef int length
cdef int max_length cdef int max_length

View File

@ -40,28 +40,18 @@ cdef class Tokens:
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can # However, we need to remember the true starting places, so that we can
# realloc. # realloc.
self._lex_ptr = <const Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*)) self._data = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self.lex = self._lex_ptr
self.idx = self._idx_ptr
self.pos = self._pos_ptr
self.ner = self._ner_ptr
cdef int i cdef int i
for i in range(size + (PADDING*2)): for i in range(size + (PADDING*2)):
self.lex[i] = &EMPTY_LEXEME self._data[i] = EMPTY_TOKEN
self.lex += PADDING self.data = self._data + PADDING
self.idx += PADDING
self.pos += PADDING
self.ner += PADDING
self.max_length = size self.max_length = size
self.length = 0 self.length = 0
def __getitem__(self, i): def __getitem__(self, i):
bounds_check(i, self.length, PADDING) bounds_check(i, self.length, PADDING)
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i], return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
self.lex[i][0]) self.data[i].sense, self.data[i].lex[0])
def __iter__(self): def __iter__(self):
for i in range(self.length): for i in range(self.length):
@ -73,10 +63,11 @@ cdef class Tokens:
cdef int push_back(self, int idx, const Lexeme* lexeme) except -1: cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
if self.length == self.max_length: if self.length == self.max_length:
self._realloc(self.length * 2) self._realloc(self.length * 2)
self.lex[self.length] = lexeme cdef TokenC* t = &self.data[self.length]
self.idx[self.length] = idx t.lex = lexeme
self.pos[self.length] = 0 t.idx = idx
self.ner[self.length] = 0 t.pos = 0
t.sense = 0
self.length += 1 self.length += 1
return idx + lexeme.length return idx + lexeme.length
@ -108,7 +99,7 @@ cdef class Tokens:
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int) output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):
output[i, j] = get_attr(self.lex[i], feature) output[i, j] = get_attr(self.data[i].lex, feature)
return output return output
def count_by(self, attr_id_t attr_id): def count_by(self, attr_id_t attr_id):
@ -118,23 +109,18 @@ cdef class Tokens:
cdef PreshCounter counts = PreshCounter(2 ** 8) cdef PreshCounter counts = PreshCounter(2 ** 8)
for i in range(self.length): for i in range(self.length):
attr = get_attr(self.lex[i], attr_id) attr = get_attr(self.data[i].lex, attr_id)
counts.inc(attr, 1) counts.inc(attr, 1)
return dict(counts) return dict(counts)
def _realloc(self, new_size): def _realloc(self, new_size):
self.max_length = new_size self.max_length = new_size
n = new_size + (PADDING * 2) n = new_size + (PADDING * 2)
self._lex_ptr = <const Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*)) self._data = <TokenC*>self.mem.realloc(self._data, n * sizeof(TokenC))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int)) self.data = self._data + PADDING
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int)) cdef int i
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
self.ner = self._ner_ptr + PADDING
for i in range(self.length, self.max_length + PADDING): for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME self.data[i] = EMPTY_TOKEN
@cython.freelist(64) @cython.freelist(64)