mirror of https://github.com/explosion/spaCy.git
* Remove deprecated _state module
This commit is contained in:
parent
9ab9dd2bf7
commit
88f55d136b
|
@ -1,114 +0,0 @@
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from ..structs cimport TokenC, Entity, Constituent
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct State:
|
|
||||||
TokenC* sent
|
|
||||||
int* stack
|
|
||||||
Entity* ent
|
|
||||||
int i
|
|
||||||
int sent_len
|
|
||||||
int stack_len
|
|
||||||
int ents_len
|
|
||||||
|
|
||||||
|
|
||||||
cdef int add_dep(State *s, const int head, const int child, const int label) except -1
|
|
||||||
|
|
||||||
|
|
||||||
cdef int pop_stack(State *s) except -1
|
|
||||||
cdef int push_stack(State *s) except -1
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint has_head(const TokenC* t) nogil
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline int get_idx(const State* s, const TokenC* t) nogil:
|
|
||||||
return t - s.sent
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_n0(const State* s) nogil:
|
|
||||||
return &s.sent[s.i]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_n1(const State* s) nogil:
|
|
||||||
if (s.i+1) >= s.sent_len:
|
|
||||||
return NULL
|
|
||||||
else:
|
|
||||||
return &s.sent[s.i+1]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_p1(const State* s) nogil:
|
|
||||||
if s.i < 1:
|
|
||||||
return NULL
|
|
||||||
else:
|
|
||||||
return &s.sent[s.i-1]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_p2(const State* s) nogil:
|
|
||||||
if s.i < 2:
|
|
||||||
return NULL
|
|
||||||
else:
|
|
||||||
return &s.sent[s.i-2]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_e0(const State* s) nogil:
|
|
||||||
if s.ent.end != 0:
|
|
||||||
return NULL
|
|
||||||
else:
|
|
||||||
return &s.sent[s.ent.start]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_e1(const State* s) nogil:
|
|
||||||
if s.ent.end != 0 or s.ent.start >= (s.i + 1):
|
|
||||||
return NULL
|
|
||||||
else:
|
|
||||||
return &s.sent[s.ent.start + 1]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_n2(const State* s) nogil:
|
|
||||||
if (s.i + 2) >= s.sent_len:
|
|
||||||
return NULL
|
|
||||||
else:
|
|
||||||
return &s.sent[s.i+2]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_s0(const State *s) nogil:
|
|
||||||
return &s.sent[s.stack[0]]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_s1(const State *s) nogil:
|
|
||||||
# Rely on our padding to ensure we don't go out of bounds here
|
|
||||||
return &s.sent[s.stack[-1]]
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_s2(const State *s) nogil:
|
|
||||||
# Rely on our padding to ensure we don't go out of bounds here
|
|
||||||
return &s.sent[s.stack[-2]]
|
|
||||||
|
|
||||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil
|
|
||||||
|
|
||||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil
|
|
||||||
|
|
||||||
cdef inline bint at_eol(const State *s) nogil:
|
|
||||||
return s.i >= s.sent_len
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint is_final(const State *s) nogil:
|
|
||||||
return at_eol(s) and s.stack_len < 2
|
|
||||||
|
|
||||||
|
|
||||||
cdef int children_in_buffer(const State *s, const int head, const int* gold) except -1
|
|
||||||
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1
|
|
||||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
|
|
||||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
|
|
||||||
|
|
||||||
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_length) except NULL
|
|
||||||
cdef int copy_state(State* dest, const State* src) except -1
|
|
||||||
|
|
||||||
cdef int count_left_kids(const TokenC* head) nogil
|
|
||||||
|
|
||||||
cdef int count_right_kids(const TokenC* head) nogil
|
|
|
@ -1,254 +0,0 @@
|
||||||
# cython: profile=True
|
|
||||||
from libc.string cimport memmove, memcpy
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
|
||||||
from ..structs cimport TokenC, Entity, Constituent
|
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
|
||||||
DEF NON_MONOTONIC = True
|
|
||||||
|
|
||||||
|
|
||||||
cdef int add_dep(State *s, int head, int child, int label) except -1:
|
|
||||||
if has_head(&s.sent[child]):
|
|
||||||
del_dep(s, child + s.sent[child].head, child)
|
|
||||||
cdef int dist = head - child
|
|
||||||
s.sent[child].head = dist
|
|
||||||
s.sent[child].dep = label
|
|
||||||
# Keep a bit-vector tracking child dependencies. If a word has a child at
|
|
||||||
# offset i from it, set that bit (tracking left and right separately)
|
|
||||||
if child > head:
|
|
||||||
s.sent[head].r_kids |= 1 << (-dist)
|
|
||||||
s.sent[head].r_edge = child - head
|
|
||||||
# Walk up the tree, setting right edge
|
|
||||||
n_iter = 0
|
|
||||||
start = head
|
|
||||||
while s.sent[head].head != 0:
|
|
||||||
head += s.sent[head].head
|
|
||||||
s.sent[head].r_edge = child - head
|
|
||||||
n_iter += 1
|
|
||||||
if n_iter >= s.sent_len:
|
|
||||||
tree = [(i + s.sent[i].head) for i in range(s.sent_len)]
|
|
||||||
msg = "Error adding dependency (%d, %d). Could not find root of tree: %s"
|
|
||||||
msg = msg % (start, child, tree)
|
|
||||||
raise Exception(msg)
|
|
||||||
else:
|
|
||||||
s.sent[head].l_kids |= 1 << dist
|
|
||||||
s.sent[head].l_edge = (child + s.sent[child].l_edge) - head
|
|
||||||
|
|
||||||
|
|
||||||
cdef int del_dep(State *s, int head, int child) except -1:
|
|
||||||
cdef const TokenC* next_child
|
|
||||||
cdef int dist = head - child
|
|
||||||
if child > head:
|
|
||||||
s.sent[head].r_kids &= ~(1 << (-dist))
|
|
||||||
next_child = get_right(s, &s.sent[head], 1)
|
|
||||||
if next_child == NULL:
|
|
||||||
s.sent[head].r_edge = 0
|
|
||||||
else:
|
|
||||||
s.sent[head].r_edge = next_child.r_edge
|
|
||||||
else:
|
|
||||||
s.sent[head].l_kids &= ~(1 << dist)
|
|
||||||
next_child = get_left(s, &s.sent[head], 1)
|
|
||||||
if next_child == NULL:
|
|
||||||
s.sent[head].l_edge = 0
|
|
||||||
else:
|
|
||||||
s.sent[head].l_edge = next_child.l_edge
|
|
||||||
|
|
||||||
|
|
||||||
cdef int pop_stack(State *s) except -1:
|
|
||||||
assert s.stack_len >= 1
|
|
||||||
s.stack_len -= 1
|
|
||||||
s.stack -= 1
|
|
||||||
#if s.stack_len == 0 and not at_eol(s):
|
|
||||||
# push_stack(s)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int push_stack(State *s) except -1:
|
|
||||||
assert s.i < s.sent_len
|
|
||||||
s.stack += 1
|
|
||||||
s.stack[0] = s.i
|
|
||||||
s.stack_len += 1
|
|
||||||
s.i += 1
|
|
||||||
|
|
||||||
|
|
||||||
cdef int children_in_buffer(const State *s, int head, const int* gold) except -1:
|
|
||||||
# Golds holds an array of head offsets --- the head of word i is i - golds[i]
|
|
||||||
# Iterate over the tokens of the queue, and check whether their gold head is
|
|
||||||
# our target
|
|
||||||
cdef int i
|
|
||||||
cdef int n = 0
|
|
||||||
for i in range(s.i, s.sent_len):
|
|
||||||
if gold[i] == head:
|
|
||||||
n += 1
|
|
||||||
elif gold[i] == i or gold[i] < head:
|
|
||||||
break
|
|
||||||
return n
|
|
||||||
|
|
||||||
|
|
||||||
cdef int head_in_buffer(const State *s, const int child, const int* gold) except -1:
|
|
||||||
return gold[child] >= s.i
|
|
||||||
|
|
||||||
|
|
||||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1:
|
|
||||||
cdef int i
|
|
||||||
cdef int n = 0
|
|
||||||
for i in range(s.stack_len):
|
|
||||||
if gold[s.stack[-i]] == head:
|
|
||||||
if NON_MONOTONIC or not has_head(get_s0(s)):
|
|
||||||
n += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
|
|
||||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1:
|
|
||||||
cdef int i
|
|
||||||
for i in range(s.stack_len):
|
|
||||||
if gold[child] == s.stack[-i]:
|
|
||||||
return 1
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef bint has_head(const TokenC* t) nogil:
|
|
||||||
return t.head != 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef const TokenC* get_left(const State* s, const TokenC* head, const int idx) nogil:
|
|
||||||
return _new_get_left(s, head, idx)
|
|
||||||
|
|
||||||
"""
|
|
||||||
cdef uint32_t kids = head.l_kids
|
|
||||||
if kids == 0:
|
|
||||||
return NULL
|
|
||||||
cdef int offset = _nth_significant_bit(kids, idx)
|
|
||||||
cdef const TokenC* child = head - offset
|
|
||||||
if child >= s.sent:
|
|
||||||
return child
|
|
||||||
else:
|
|
||||||
return NULL
|
|
||||||
"""
|
|
||||||
|
|
||||||
cdef const TokenC* get_right(const State* s, const TokenC* head, const int idx) nogil:
|
|
||||||
return _new_get_right(s, head, idx)
|
|
||||||
|
|
||||||
"""
|
|
||||||
cdef uint32_t kids = head.r_kids
|
|
||||||
if kids == 0:
|
|
||||||
return NULL
|
|
||||||
cdef int offset = _nth_significant_bit(kids, idx)
|
|
||||||
cdef const TokenC* child = head + offset
|
|
||||||
if child < (s.sent + s.sent_len):
|
|
||||||
return child
|
|
||||||
else:
|
|
||||||
return NULL
|
|
||||||
"""
|
|
||||||
|
|
||||||
cdef int count_left_kids(const TokenC* head) nogil:
|
|
||||||
return _popcount(head.l_kids)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int count_right_kids(const TokenC* head) nogil:
|
|
||||||
return _popcount(head.r_kids)
|
|
||||||
|
|
||||||
|
|
||||||
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
|
||||||
cdef int padded_len = sent_len + PADDING + PADDING
|
|
||||||
cdef State* s = <State*>mem.alloc(1, sizeof(State))
|
|
||||||
#s.ctnt = <Constituent*>mem.alloc(padded_len, sizeof(Constituent))
|
|
||||||
s.ent = <Entity*>mem.alloc(padded_len, sizeof(Entity))
|
|
||||||
s.stack = <int*>mem.alloc(padded_len, sizeof(int))
|
|
||||||
for i in range(PADDING):
|
|
||||||
s.stack[i] = -1
|
|
||||||
#s.ctnt += (PADDING -1)
|
|
||||||
s.stack += (PADDING - 1)
|
|
||||||
s.ent += (PADDING - 1)
|
|
||||||
assert s.stack[0] == -1
|
|
||||||
state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))
|
|
||||||
memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC))
|
|
||||||
s.sent = state_sent + PADDING
|
|
||||||
s.stack_len = 0
|
|
||||||
s.i = 0
|
|
||||||
s.sent_len = sent_len
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
cdef int copy_state(State* dest, const State* src) except -1:
|
|
||||||
cdef int i
|
|
||||||
# Copy stack --- remember stack uses pointer arithmetic, so stack[-stack_len]
|
|
||||||
# is the last word of the stack.
|
|
||||||
dest.stack += (src.stack_len - dest.stack_len)
|
|
||||||
for i in range(src.stack_len):
|
|
||||||
dest.stack[-i] = src.stack[-i]
|
|
||||||
dest.stack_len = src.stack_len
|
|
||||||
# Copy sentence (i.e. the parse), up to and including word i.
|
|
||||||
if src.i > dest.i:
|
|
||||||
memcpy(dest.sent, src.sent, sizeof(TokenC) * (src.i+1))
|
|
||||||
else:
|
|
||||||
memcpy(dest.sent, src.sent, sizeof(TokenC) * (dest.i+1))
|
|
||||||
dest.i = src.i
|
|
||||||
# Copy assigned entities --- also pointer arithmetic
|
|
||||||
dest.ent += (src.ents_len - dest.ents_len)
|
|
||||||
for i in range(src.ents_len):
|
|
||||||
dest.ent[-i] = src.ent[-i]
|
|
||||||
dest.ents_len = src.ents_len
|
|
||||||
|
|
||||||
|
|
||||||
# From https://en.wikipedia.org/wiki/Hamming_weight
|
|
||||||
cdef inline uint32_t _popcount(uint32_t x) nogil:
|
|
||||||
"""Find number of non-zero bits."""
|
|
||||||
cdef uint32_t count = 0
|
|
||||||
while x != 0:
|
|
||||||
x &= x - 1
|
|
||||||
count += 1
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
|
|
||||||
cdef uint32_t i
|
|
||||||
for i in range(32):
|
|
||||||
if bits & (1 << i):
|
|
||||||
n -= 1
|
|
||||||
if n < 1:
|
|
||||||
return i
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef const TokenC* _new_get_left(const State* s, const TokenC* target, int idx) nogil:
|
|
||||||
if idx < 1:
|
|
||||||
return NULL
|
|
||||||
cdef const TokenC* ptr = s.sent
|
|
||||||
while ptr < target:
|
|
||||||
# If this head is still to the right of us, we can skip to it
|
|
||||||
# No token that's between this token and this head could be our
|
|
||||||
# child.
|
|
||||||
if (ptr.head >= 1) and (ptr + ptr.head) < target:
|
|
||||||
ptr += ptr.head
|
|
||||||
|
|
||||||
elif ptr + ptr.head == target:
|
|
||||||
idx -= 1
|
|
||||||
if idx == 0:
|
|
||||||
return ptr
|
|
||||||
ptr += 1
|
|
||||||
else:
|
|
||||||
ptr += 1
|
|
||||||
return NULL
|
|
||||||
|
|
||||||
|
|
||||||
cdef const TokenC* _new_get_right(const State* s, const TokenC* target, int idx) nogil:
|
|
||||||
if idx < 1:
|
|
||||||
return NULL
|
|
||||||
cdef const TokenC* ptr = s.sent + (s.sent_len - 1)
|
|
||||||
while ptr > target:
|
|
||||||
# If this head is still to the right of us, we can skip to it
|
|
||||||
# No token that's between this token and this head could be our
|
|
||||||
# child.
|
|
||||||
if (ptr.head < 0) and ((ptr + ptr.head) > target):
|
|
||||||
ptr += ptr.head
|
|
||||||
elif ptr + ptr.head == target:
|
|
||||||
idx -= 1
|
|
||||||
if idx == 0:
|
|
||||||
return ptr
|
|
||||||
ptr -= 1
|
|
||||||
else:
|
|
||||||
ptr -= 1
|
|
||||||
return NULL
|
|
Loading…
Reference in New Issue