spaCy/spacy/syntax/_state.pxd

426 lines
14 KiB
Cython
Raw Normal View History

from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint32_t, uint64_t
from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
2017-05-28 16:09:27 +00:00
from ..typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil:
return Lexeme.c_check_flag(token.lex, IS_SPACE)
cdef cppclass StateC:
int* _stack
int* _buffer
bint* shifted
TokenC* _sent
Entity* _ents
TokenC _empty_token
int length
int offset
int _s_i
int _b_i
int _e_i
int _break
__init__(const TokenC* sent, int length) nogil:
cdef int PADDING = 5
this._buffer = <int*>calloc(length + (PADDING * 2), sizeof(int))
this._stack = <int*>calloc(length + (PADDING * 2), sizeof(int))
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
cdef int i
for i in range(length + (PADDING * 2)):
this._ents[i].end = -1
this._sent[i].l_edge = i
this._sent[i].r_edge = i
2017-03-11 12:58:34 +00:00
for i in range(PADDING):
this._sent[i].lex = &EMPTY_LEXEME
this._sent += PADDING
this._ents += PADDING
this._buffer += PADDING
this._stack += PADDING
this.shifted += PADDING
this.length = length
this._break = -1
this._s_i = 0
this._b_i = 0
this._e_i = 0
for i in range(length):
this._buffer[i] = i
memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME
for i in range(length):
this._sent[i] = sent[i]
this._buffer[i] = i
for i in range(length, length+PADDING):
this._sent[i].lex = &EMPTY_LEXEME
__dealloc__():
cdef int PADDING = 5
free(this._sent - PADDING)
free(this._ents - PADDING)
free(this._buffer - PADDING)
free(this._stack - PADDING)
free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.S(2)
ids[5] = this.L(this.S(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[6] = this.R(this.S(0), 1)
ids[7] = this.L(this.B(0), 1)
ids[8] = this.R(this.S(0), 2)
ids[9] = this.L(this.S(1), 1)
ids[10] = this.L(this.S(1), 2)
ids[11] = this.R(this.S(1), 1)
ids[12] = this.R(this.S(1), 2)
elif n == 6:
2017-05-30 20:12:19 +00:00
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
ids[1] = this.B(0)
ids[2] = this.B(1)
ids[3] = this.E(0)
2017-05-30 20:12:19 +00:00
if ids[3] >= 1:
ids[4] = this.E(0)-1
else:
ids[4] = -1
2017-05-31 01:01:02 +00:00
if (ids[3]+1) < this.length:
2017-05-30 20:12:19 +00:00
ids[5] = this.E(0)+1
else:
ids[5] = -1
else:
# TODO error =/
pass
for i in range(n):
if ids[i] >= 0:
ids[i] += this.offset
int S(int i) nogil const:
if i >= this._s_i:
return -1
return this._stack[this._s_i - (i+1)]
int B(int i) nogil const:
if (i + this._b_i) >= this.length:
return -1
return this._buffer[this._b_i + i]
const TokenC* S_(int i) nogil const:
return this.safe_get(this.S(i))
const TokenC* B_(int i) nogil const:
return this.safe_get(this.B(i))
const TokenC* H_(int i) nogil const:
return this.safe_get(this.H(i))
const TokenC* E_(int i) nogil const:
return this.safe_get(this.E(i))
const TokenC* L_(int i, int idx) nogil const:
return this.safe_get(this.L(i, idx))
const TokenC* R_(int i, int idx) nogil const:
return this.safe_get(this.R(i, idx))
const TokenC* safe_get(int i) nogil const:
if i < 0 or i >= this.length:
return &this._empty_token
else:
return &this._sent[i]
int H(int i) nogil const:
if i < 0 or i >= this.length:
return -1
return this._sent[i].head + i
int E(int i) nogil const:
if this._e_i <= 0 or this._e_i >= this.length:
return 0
if i < 0 or i >= this._e_i:
return 0
return this._ents[this._e_i - (i+1)].start
int L(int i, int idx) nogil const:
if idx < 1:
return -1
if i < 0 or i >= this.length:
return -1
cdef const TokenC* target = &this._sent[i]
if target.l_kids < <uint32_t>idx:
return -1
cdef const TokenC* ptr = &this._sent[target.l_edge]
while ptr < target:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head >= 1) and (ptr + ptr.head) < target:
ptr += ptr.head
elif ptr + ptr.head == target:
idx -= 1
if idx == 0:
return ptr - this._sent
ptr += 1
else:
ptr += 1
return -1
int R(int i, int idx) nogil const:
if idx < 1:
return -1
if i < 0 or i >= this.length:
return -1
cdef const TokenC* target = &this._sent[i]
if target.r_kids < <uint32_t>idx:
return -1
cdef const TokenC* ptr = &this._sent[target.r_edge]
while ptr > target:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head < 0) and ((ptr + ptr.head) > target):
ptr += ptr.head
elif ptr + ptr.head == target:
idx -= 1
if idx == 0:
return ptr - this._sent
ptr -= 1
else:
ptr -= 1
return -1
bint empty() nogil const:
return this._s_i <= 0
bint eol() nogil const:
return this.buffer_length() == 0
bint at_break() nogil const:
return this._break != -1
bint is_final() nogil const:
return this.stack_depth() <= 0 and this._b_i >= this.length
bint has_head(int i) nogil const:
return this.safe_get(i).head != 0
int n_L(int i) nogil const:
return this.safe_get(i).l_kids
int n_R(int i) nogil const:
return this.safe_get(i).r_kids
bint stack_is_connected() nogil const:
return False
bint entity_is_open() nogil const:
if this._e_i < 1:
return False
return this._ents[this._e_i-1].end == -1
int stack_depth() nogil const:
return this._s_i
int buffer_length() nogil const:
if this._break != -1:
return this._break - this._b_i
else:
return this.length - this._b_i
uint64_t hash() nogil const:
cdef TokenC[11] sig
sig[0] = this.S_(2)[0]
sig[1] = this.S_(1)[0]
sig[2] = this.R_(this.S(1), 1)[0]
sig[3] = this.L_(this.S(0), 1)[0]
sig[4] = this.L_(this.S(0), 2)[0]
sig[5] = this.S_(0)[0]
sig[6] = this.R_(this.S(0), 2)[0]
sig[7] = this.R_(this.S(0), 1)[0]
sig[8] = this.B_(0)[0]
sig[9] = this.E_(0)[0]
sig[10] = this.E_(1)[0]
return hash64(sig, sizeof(sig), this._s_i)
void push() nogil:
if this.B(0) != -1:
this._stack[this._s_i] = this.B(0)
this._s_i += 1
this._b_i += 1
if this._b_i > this._break:
this._break = -1
void pop() nogil:
if this._s_i >= 1:
this._s_i -= 1
void unshift() nogil:
this._b_i -= 1
this._buffer[this._b_i] = this.S(0)
this._s_i -= 1
this.shifted[this.B(0)] = True
2017-05-28 16:09:27 +00:00
void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child):
this.del_arc(this.H(child), child)
cdef int dist = head - child
this._sent[child].head = dist
this._sent[child].dep = label
cdef int i
if child > head:
this._sent[head].r_kids += 1
# Some transition systems can have a word in the buffer have a
# rightward child, e.g. from Unshift.
this._sent[head].r_edge = this._sent[child].r_edge
i = 0
while this.has_head(head) and i < this.length:
head = this.H(head)
this._sent[head].r_edge = this._sent[child].r_edge
i += 1 # Guard against infinite loops
else:
this._sent[head].l_kids += 1
this._sent[head].l_edge = this._sent[child].l_edge
void del_arc(int h_i, int c_i) nogil:
cdef int dist = h_i - c_i
cdef TokenC* h = &this._sent[h_i]
cdef int i = 0
if c_i > h_i:
# this.R_(h_i, 2) returns the second-rightmost child token of h_i
# If we have more than 2 rightmost children, our 2nd rightmost child's
# rightmost edge is going to be our new rightmost edge.
h.r_edge = this.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
h.r_kids -= 1
new_edge = h.r_edge
# Correct upwards in the tree --- see Issue #251
while h.head < 0 and i < this.length: # Guard infinite loop
h += h.head
h.r_edge = new_edge
i += 1
else:
# Same logic applies for left edge, but we don't need to walk up
# the tree, as the head is off the stack.
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
h.l_kids -= 1
2017-05-28 16:09:27 +00:00
void open_ent(attr_t label) nogil:
this._ents[this._e_i].start = this.B(0)
this._ents[this._e_i].label = label
this._ents[this._e_i].end = -1
this._e_i += 1
void close_ent() nogil:
# Note that we don't decrement _e_i here! We want to maintain all
# entities, not over-write them...
this._ents[this._e_i-1].end = this.B(0)+1
this._sent[this.B(0)].ent_iob = 1
2017-05-28 16:09:27 +00:00
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
if 0 <= i < this.length:
this._sent[i].ent_iob = ent_iob
this._sent[i].ent_type = ent_type
void set_break(int i) nogil:
if 0 <= i < this.length:
2016-05-05 13:15:34 +00:00
this._sent[i].sent_start = True
this._break = this._b_i
void clone(const StateC* src) nogil:
2017-05-27 20:49:37 +00:00
this.length = src.length
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i
this._break = src._break
2017-05-26 16:31:23 +00:00
this.offset = src.offset
2017-05-27 20:49:37 +00:00
this._empty_token = src._empty_token
void fast_forward() nogil:
# space token attachement policy:
# - attach space tokens always to the last preceding real token
# - except if it's the beginning of a sentence, then attach to the first following
# - boundary case: a document containing multiple space tokens but nothing else,
# then make the last space token the head of all others
while is_space_token(this.B_(0)) \
or this.buffer_length() == 0 \
or this.stack_depth() == 0:
if this.buffer_length() == 0:
# remove the last sentence's root from the stack
if this.stack_depth() == 1:
this.pop()
# parser got stuck: reduce stack or unshift
elif this.stack_depth() > 1:
if this.has_head(this.S(0)):
this.pop()
else:
this.unshift()
# stack is empty but there is another sentence on the buffer
elif (this.length - this._b_i) >= 1:
this.push()
else: # stack empty and nothing else coming
break
elif is_space_token(this.B_(0)):
# the normal case: we're somewhere inside a sentence
if this.stack_depth() > 0:
# assert not is_space_token(this.S_(0))
# attach all coming space tokens to their last preceding
# real token (which should be on the top of the stack)
while is_space_token(this.B_(0)):
this.add_arc(this.S(0),this.B(0),0)
this.push()
this.pop()
# the rare case: we're at the beginning of a document:
# space tokens are attached to the first real token on the buffer
elif this.stack_depth() == 0:
# store all space tokens on the stack until a real token shows up
# or the last token on the buffer is reached
while is_space_token(this.B_(0)) and this.buffer_length() > 1:
this.push()
# empty the stack by attaching all space tokens to the
# first token on the buffer
# boundary case: if all tokens are space tokens, the last one
# becomes the head of all others
while this.stack_depth() > 0:
this.add_arc(this.B(0),this.S(0),0)
this.pop()
# move the first token onto the stack
this.push()
elif this.stack_depth() == 0:
# for one token sentences (?)
if this.buffer_length() == 1:
this.push()
this.pop()
# with an empty stack and a non-empty buffer
# only shift is valid anyway
elif (this.length - this._b_i) >= 1:
this.push()
else: # can this even happen?
break