mirror of https://github.com/explosion/spaCy.git
* Fix EMPTY_TOKEN
This commit is contained in:
parent
3819a88e1b
commit
9f17467c2e
|
@ -16,9 +16,6 @@ cdef struct TokenC:
|
||||||
int sense
|
int sense
|
||||||
|
|
||||||
|
|
||||||
cdef TokenC EMPTY_TOKEN = TokenC(&EMPTY_LEXEME, 0, 0, 0)
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef StringStore _string_store
|
cdef StringStore _string_store
|
||||||
|
|
|
@ -43,7 +43,7 @@ cdef class Tokens:
|
||||||
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(size + (PADDING*2)):
|
for i in range(size + (PADDING*2)):
|
||||||
data_start[i] = EMPTY_TOKEN
|
data_start[i].lex = &EMPTY_LEXEME
|
||||||
self.data = data_start + PADDING
|
self.data = data_start + PADDING
|
||||||
self.max_length = size
|
self.max_length = size
|
||||||
self.length = 0
|
self.length = 0
|
||||||
|
@ -86,10 +86,7 @@ cdef class Tokens:
|
||||||
return idx
|
return idx
|
||||||
|
|
||||||
cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
|
cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
|
||||||
if tag_type == POS:
|
self.data[i].pos = tag
|
||||||
self.pos[i] = tag
|
|
||||||
elif tag_type == ENTITY:
|
|
||||||
self.ner[i] = tag
|
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
|
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
|
||||||
|
@ -116,12 +113,17 @@ cdef class Tokens:
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
|
# What we're storing is a "padded" array. We've jumped forward PADDING
|
||||||
|
# places, and are storing the pointer to that. This way, we can access
|
||||||
|
# words out-of-bounds, and get out-of-bounds markers.
|
||||||
|
# Now that we want to realloc, we need the address of the true start,
|
||||||
|
# so we jump the pointer back PADDING places.
|
||||||
cdef TokenC* data_start = self.data - PADDING
|
cdef TokenC* data_start = self.data - PADDING
|
||||||
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
|
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
|
||||||
self.data = data_start + PADDING
|
self.data = data_start + PADDING
|
||||||
cdef int i
|
cdef int i
|
||||||
for i in range(self.length, self.max_length + PADDING):
|
for i in range(self.length, self.max_length + PADDING):
|
||||||
self.data[i] = EMPTY_TOKEN
|
self.data[i].lex = &EMPTY_LEXEME
|
||||||
|
|
||||||
|
|
||||||
@cython.freelist(64)
|
@cython.freelist(64)
|
||||||
|
|
Loading…
Reference in New Issue