2015-06-08 23:39:54 +00:00
|
|
|
from libc.string cimport memcpy, memset
|
|
|
|
from libc.stdint cimport uint32_t
|
2015-06-09 19:20:14 +00:00
|
|
|
from ..vocab cimport EMPTY_LEXEME
|
2015-06-10 02:20:23 +00:00
|
|
|
from ..structs cimport Entity
|
2016-01-19 01:54:15 +00:00
|
|
|
from ..lexeme cimport Lexeme
|
|
|
|
from ..symbols cimport punct
|
|
|
|
from ..attrs cimport IS_SPACE
|
2015-06-08 23:39:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
cdef class StateClass:
|
2015-06-09 19:20:14 +00:00
|
|
|
def __init__(self, int length):
    """Allocate the working arrays for a sentence of `length` tokens.

    Every array is allocated with PADDING extra slots on each side and the
    stored pointer is then advanced by PADDING, so indices in
    [-PADDING, length + PADDING) are addressable without bounds checks.
    The buffer starts out holding 0..length-1 in order; the stack and the
    entity list start empty and no sentence break is set.
    """
    cdef Pool mem = Pool()
    cdef int PADDING = 5
    # Total number of slots per array, padding on both sides included.
    cdef int n_slots = length + (PADDING * 2)
    cdef int i
    self._buffer = <int*>mem.alloc(n_slots, sizeof(int))
    self._stack = <int*>mem.alloc(n_slots, sizeof(int))
    self.shifted = <bint*>mem.alloc(n_slots, sizeof(bint))
    self._sent = <TokenC*>mem.alloc(n_slots, sizeof(TokenC))
    self._ents = <Entity*>mem.alloc(n_slots, sizeof(Entity))
    for i in range(n_slots):
        self._ents[i].end = -1
        # NOTE(review): the edges are written with the pre-shift index, i.e.
        # before the pointers are advanced by PADDING below.
        self._sent[i].l_edge = i
        self._sent[i].r_edge = i
        # Give *every* slot a valid lexeme pointer. Previously only the
        # pre-shift range [length, length + 2 * PADDING) was filled, which
        # after the pointer shift left the front padding slots without one.
        self._sent[i].lex = &EMPTY_LEXEME
    # Shift the base pointers so that index 0 is the first real token.
    self._sent += PADDING
    self._ents += PADDING
    self._buffer += PADDING
    self._stack += PADDING
    self.shifted += PADDING
    self.mem = mem
    self.length = length
    self._break = -1
    self._s_i = 0
    self._b_i = 0
    self._e_i = 0
    for i in range(length):
        self._buffer[i] = i
    self._empty_token.lex = &EMPTY_LEXEME
|
|
|
|
|
2015-08-08 21:32:42 +00:00
|
|
|
@property
def stack(self):
    """Return the set of values `S(0)` .. `S(_s_i - 1)`."""
    members = set()
    for depth in range(self._s_i):
        members.add(self.S(depth))
    return members
|
|
|
|
|
|
|
|
@property
def queue(self):
    """Return the set of values `B(0)` .. `B(n - 1)` for the n queued tokens.

    The count comes from `buffer_length()`, the same measure `fast_forward`
    uses. Iterating over `range(self._b_i)` instead, as was done before,
    counts the tokens already consumed (`push` increments `_b_i`), so a
    freshly created state reported an empty queue.
    """
    return {self.B(i) for i in range(self.buffer_length())}
|
2015-08-08 21:32:42 +00:00
|
|
|
|
2015-06-09 21:23:28 +00:00
|
|
|
cdef int E(self, int i) nogil:
    # Start index of the i-th most recently opened entity (i == 0 is the
    # latest one). Returns 0 when no entity has been opened, when the entity
    # counter has reached the sentence length, or when `i` is out of range.
    self.c.E(i)
    if self._e_i <= 0 or self._e_i >= self.length or i < 0 or i >= self._e_i:
        return 0
    return self._ents[self._e_i - i - 1].start
|
2015-06-09 21:23:28 +00:00
|
|
|
|
2015-06-08 23:39:54 +00:00
|
|
|
cdef int L(self, int i, int idx) nogil:
    # Index of the idx-th left child of token `i`, counting from the token's
    # left edge (idx == 1 is the first child met when scanning rightwards).
    # Returns -1 for idx < 1, for `i` outside the sentence, or when the token
    # has fewer than `idx` left children.
    self.c.L(i, idx)
    if idx < 1 or i < 0 or i >= self.length:
        return -1
    cdef const TokenC* target = &self._sent[i]
    if target.l_kids < idx:
        return -1
    cdef const TokenC* ptr = &self._sent[target.l_edge]
    cdef int remaining = idx
    while ptr < target:
        if ptr.head >= 1 and (ptr + ptr.head) < target:
            # The head of this token lies to the right but still before the
            # target: nothing in between can be a child of the target, so
            # jump straight to that head.
            ptr += ptr.head
            continue
        if ptr + ptr.head == target:
            # Found a child of the target.
            remaining -= 1
            if remaining == 0:
                return ptr - self._sent
        ptr += 1
    return -1
|
2015-06-08 23:39:54 +00:00
|
|
|
|
|
|
|
cdef int R(self, int i, int idx) nogil:
    # Index of the idx-th right child of token `i`, counting from the token's
    # right edge (idx == 1 is the first child met when scanning leftwards).
    # Returns -1 for idx < 1, for `i` outside the sentence, or when the token
    # has fewer than `idx` right children.
    self.c.R(i, idx)
    if idx < 1 or i < 0 or i >= self.length:
        return -1
    cdef const TokenC* target = &self._sent[i]
    if target.r_kids < idx:
        return -1
    cdef const TokenC* ptr = &self._sent[target.r_edge]
    cdef int remaining = idx
    while ptr > target:
        if ptr.head < 0 and (ptr + ptr.head) > target:
            # The head of this token lies to the left but still after the
            # target: nothing in between can be a child of the target, so
            # jump straight to that head.
            ptr += ptr.head
            continue
        if ptr + ptr.head == target:
            # Found a child of the target.
            remaining -= 1
            if remaining == 0:
                return ptr - self._sent
        ptr -= 1
    return -1
|
|
|
|
|
2015-06-08 23:39:54 +00:00
|
|
|
cdef void push(self) nogil:
    # Move the front of the buffer onto the stack. The stack slot is only
    # written when B(0) is a real index (not -1); both cursors advance
    # regardless. The sentence break is cleared once the buffer cursor has
    # moved past it.
    self.c.push()
    cdef int front = self.B(0)
    if front != -1:
        self._stack[self._s_i] = front
    self._s_i += 1
    self._b_i += 1
    if self._b_i > self._break:
        self._break = -1
|
2015-06-08 23:39:54 +00:00
|
|
|
|
|
|
|
cdef void pop(self) nogil:
    # Drop the top of the stack; does nothing to the stack cursor when the
    # stack is already empty.
    self.c.pop()
    if self._s_i < 1:
        return
    self._s_i -= 1
|
2015-06-08 23:39:54 +00:00
|
|
|
|
2015-06-10 08:13:03 +00:00
|
|
|
cdef void unshift(self) nogil:
    # Move the top of the stack back to the front of the buffer and flag the
    # token now at the front of the buffer as shifted.
    cdef int top
    self.c.unshift()
    self._b_i -= 1
    top = self.S(0)
    self._buffer[self._b_i] = top
    self._s_i -= 1
    self.shifted[self.B(0)] = True
|
2015-06-10 08:13:03 +00:00
|
|
|
|
2015-06-10 12:08:30 +00:00
|
|
|
cdef void fast_forward(self) nogil:
    # Apply forced moves for as long as the buffer is empty, the stack is
    # empty, or the word on top of the stack is whitespace. Stops as soon as
    # none of those hold, or when no forced move applies.
    cdef int n_queued
    cdef int depth
    self.c.fast_forward()
    while True:
        n_queued = self.buffer_length()
        depth = self.stack_depth()
        if n_queued != 0 and depth != 0 \
        and not Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
            break
        if n_queued == 1 and depth == 0:
            self.push()
            self.pop()
        elif n_queued == 0 and depth == 1:
            self.pop()
        elif n_queued == 0 and depth >= 2:
            # Keep the stack top only if it still needs a head.
            if self.has_head(self.S(0)):
                self.pop()
            else:
                self.unshift()
        elif (self.length - self._b_i) >= 1 and depth == 0:
            self.push()
        elif Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
            # Attach the whitespace token to the front of the buffer with
            # label 0 and remove it from the stack.
            self.add_arc(self.B(0), self.S(0), 0)
            self.pop()
        else:
            break
|
|
|
|
|
2015-06-08 23:39:54 +00:00
|
|
|
cdef void add_arc(self, int head, int child, int label) nogil:
    # Attach `child` to `head` with dependency label `label`. Any arc the
    # child already has is removed first. The head is stored on the child as
    # a relative offset (head - child), and the head's child count and edge
    # are updated on the matching side.
    cdef int ancestor
    cdef int hops
    self.c.add_arc(head, child, label)
    if self.has_head(child):
        self.del_arc(self.H(child), child)

    self._sent[child].head = head - child
    self._sent[child].dep = label
    if child <= head:
        self._sent[head].l_kids += 1
        self._sent[head].l_edge = self._sent[child].l_edge
        return
    self._sent[head].r_kids += 1
    # Some transition systems can have a word in the buffer have a
    # rightward child, e.g. from Unshift, so the new right edge is pushed up
    # through every ancestor of the head as well.
    self._sent[head].r_edge = self._sent[child].r_edge
    ancestor = head
    hops = 0
    # `hops` caps the walk at `length` steps to guard against cycles.
    while self.has_head(ancestor) and hops < self.length:
        ancestor = self.H(ancestor)
        self._sent[ancestor].r_edge = self._sent[child].r_edge
        hops += 1
|
|
|
|
|
|
|
|
cdef void del_arc(self, int h_i, int c_i) nogil:
    # Detach child `c_i` from head `h_i`: recompute the head's edge on the
    # child's side and decrement the child count for that side. With at
    # least two children on that side, the new edge is taken from the result
    # of R_(h_i, 2) / L_(h_i, 2); otherwise the edge collapses to the head
    # itself. The child's own `head` field is not touched here.
    # (The unused local `dist` from the previous version has been dropped.)
    self.c.del_arc(h_i, c_i)
    cdef TokenC* h = &self._sent[h_i]
    if c_i > h_i:
        h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
        h.r_kids -= 1
    else:
        h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
        h.l_kids -= 1
|
2015-06-08 23:39:54 +00:00
|
|
|
|
2015-06-09 21:23:28 +00:00
|
|
|
cdef void open_ent(self, int label) nogil:
    # Start a new entity with the given label at the front of the buffer.
    # Its end is left at -1 until the entity is closed.
    self.c.open_ent(label)
    cdef Entity* slot = &self._ents[self._e_i]
    slot.start = self.B(0)
    slot.label = label
    slot.end = -1
    self._e_i += 1
|
2015-06-09 21:23:28 +00:00
|
|
|
|
|
|
|
cdef void close_ent(self) nogil:
    # Close the most recently opened entity so that it ends just after the
    # token at the front of the buffer, and set that token's ent_iob to 1.
    # Note that we don't decrement _e_i here! We want to maintain all
    # entities, not over-write them...
    self.c.close_ent()
    cdef int front = self.B(0)
    self._ents[self._e_i - 1].end = front + 1
    self._sent[front].ent_iob = 1
|
|
|
|
|
|
|
|
cdef void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
    # Write the entity IOB code and entity type onto token `i`; indices
    # outside [0, length) leave the token array untouched.
    self.c.set_ent_tag(i, ent_iob, ent_type)
    if i < 0 or i >= self.length:
        return
    self._sent[i].ent_iob = ent_iob
    self._sent[i].ent_type = ent_type
|
|
|
|
|
2015-06-11 23:50:23 +00:00
|
|
|
cdef void set_break(self, int _) nogil:
    # Mark the token at the front of the buffer as a sentence start and
    # record the current buffer cursor as the break position. Nothing is
    # recorded locally when the front of the buffer is not a valid index.
    self.c.set_break(_)
    cdef int front = self.B(0)
    if front < 0 or front >= self.length:
        return
    self._sent[front].sent_start = True
    self._break = self._b_i
|
2015-06-08 23:39:54 +00:00
|
|
|
|
|
|
|
cdef void clone(self, StateClass src) nogil:
    # Copy the full parser state of `src` into this instance. Only the
    # `length` real slots of each array are copied (the padding is not), and
    # the copy size is taken from this instance's own `length`.
    # NOTE(review): assumes src.length == self.length -- confirm at callers.
    self.c.clone(src.c)
    memcpy(self._sent, src._sent, self.length * sizeof(TokenC))
    memcpy(self._stack, src._stack, self.length * sizeof(int))
    memcpy(self._buffer, src._buffer, self.length * sizeof(int))
    memcpy(self._ents, src._ents, self.length * sizeof(Entity))
    # The shifted flags are part of the state too (they are set by unshift);
    # they were previously left out, so a clone kept its own stale flags.
    memcpy(self.shifted, src.shifted, self.length * sizeof(bint))
    self._b_i = src._b_i
    self._s_i = src._s_i
    self._e_i = src._e_i
    self._break = src._break
|
2015-06-09 23:35:28 +00:00
|
|
|
|
|
|
|
def print_state(self, words):
    """Return a one-line summary of the top of the stack and the buffer.

    The line lists S(2), S(1), S(0), each suffixed with `_<head offset>`,
    then `|`, then B(0) and B(1). A trailing `'_'` is appended to the word
    list, so an index of -1 is rendered as `_`.
    """
    labels = list(words)
    labels.append('_')
    pieces = []
    for depth in (2, 1, 0):
        pieces.append(labels[self.S(depth)] + '_%d' % self.S_(depth).head)
    pieces.append('|')
    pieces.append(labels[self.B(0)])
    pieces.append(labels[self.B(1)])
    return ' '.join(pieces)
|