spaCy/spacy/syntax/stateclass.pyx

# coding: utf-8
# cython: infer_types=True
from __future__ import unicode_literals

from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t, uint64_t

from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..attrs cimport attr_id_t
from ..tokens.token cimport Token


cdef class StateClass:
    """Python-visible wrapper around the parse state object held in ``self.c``.

    The helpers ``S``, ``S_`` and ``B`` used by the methods below are declared
    outside this file view. Judging by the ``stack`` and ``queue`` properties
    they presumably address stack and buffer positions -- confirm against the
    declaration file.
    """

    def __init__(self, int length):
        """Attach a fresh ``Pool`` to the instance as ``self.mem``.

        length: Accepted but not used by this method.
        """
        self.mem = Pool()

    def __dealloc__(self):
        # Releases the object behind self.c.
        # NOTE(review): self.c is not assigned in __init__ here -- presumably
        # it is set up elsewhere; confirm.
        del self.c

    @property
    def stack(self):
        """Set of ``self.S(i)`` for every ``i`` in ``range(self.c._s_i)``.

        The result is a set, so ordering and duplicates are not preserved.
        """
        members = set()
        for depth in range(self.c._s_i):
            members.add(self.S(depth))
        return members

    @property
    def queue(self):
        """Set of ``self.B(i)`` for every ``i`` below ``self.c.buffer_length()``.

        The result is a set, so ordering and duplicates are not preserved.
        """
        members = set()
        for offset in range(self.c.buffer_length()):
            members.add(self.B(offset))
        return members

    @property
    def token_vector_lenth(self):
        """Size of axis 1 of ``self.doc.tensor``.

        NOTE(review): the property name is spelled ``lenth``; it is kept
        because it is part of the public interface. ``self.doc`` is not set
        anywhere in this class as shown -- confirm where it comes from.
        """
        tensor = self.doc.tensor
        return tensor.shape[1]

    def py_is_final(self):
        """Return the result of ``self.c.is_final()`` to Python callers."""
        return self.c.is_final()

    def print_state(self, words):
        """Build a one-line, space-separated summary of the current state.

        words: Iterable of strings, indexed by the values that ``self.S`` and
            ``self.B`` return.

        Returns a string of the form ``"S2_h S1_h S0_h | B0 B1"``, where each
        ``Sk`` entry is the word at ``self.S(k)`` followed by ``_`` and the
        ``head`` field of ``self.S_(k)``, and each ``Bk`` is the word at
        ``self.B(k)``.
        """
        # A trailing '_' is appended so that an index of -1 picks a
        # placeholder -- presumably what S/B yield for an empty slot; confirm.
        padded = list(words) + ['_']
        stack_parts = []
        for depth in range(3):
            stack_parts.append(
                padded[self.S(depth)] + '_%d' % self.S_(depth).head)
        # Collected as S(0), S(1), S(2); the output lists the deepest first.
        stack_parts.reverse()
        lookahead = [padded[self.B(0)], padded[self.B(1)]]
        return ' '.join(stack_parts + ['|'] + lookahead)

    def nr_context_tokens(self, int nF, int nB, int nS, int nL, int nR):
        """Return 4, matching the four entries ``set_context_tokens`` writes.

        All five arguments are ignored.
        """
        return 4

    def set_context_tokens(self, int[:] output, nF=1, nB=0, nS=2,
            nL=2, nR=2):
        """Write four context values into ``output``.

        output: Integer buffer; positions 0-3 receive ``B(0)``, ``B(1)``,
            ``S(0)`` and ``S(1)``, in that order.
        nF, nB, nS, nL, nR: Accepted but not used by this method.
        """
        for slot in range(2):
            output[slot] = self.B(slot)
        for slot in range(2):
            output[2 + slot] = self.S(slot)
        # Only these four slots are filled. Features based on self.L / self.R
        # of S(0) and S(1) existed here only as commented-out code and are
        # not written.

    def set_attributes(self, uint64_t[:, :] vals, int[:] tokens, int[:] names):
        """Fill ``vals`` with one row of attribute values per entry of ``tokens``.

        vals: 2-D output buffer; ``vals[i, j]`` receives attribute ``names[j]``
            of the token at index ``tokens[i]`` in ``self.c._sent``.
        tokens: Token indices. A negative index zero-fills the whole row.
        names: Attribute ids, each cast to ``attr_id_t`` before lookup.
        """
        cdef int row, col, index
        for row in range(tokens.shape[0]):
            index = tokens[row]
            if index < 0:
                # No token for this slot: blank the entire row.
                vals[row] = 0
                continue
            tok = &self.c._sent[index]
            for col in range(names.shape[0]):
                vals[row, col] = Token.get_struct_attr(tok, <attr_id_t>names[col])

    def set_token_vectors(self, float[:, :] tokvecs,
            float[:, :] all_tokvecs, int[:] indices):
        """Copy the rows of ``all_tokvecs`` selected by ``indices`` into ``tokvecs``.

        tokvecs: 2-D output buffer; row ``i`` receives
            ``all_tokvecs[indices[i]]``.
        all_tokvecs: 2-D source buffer of vectors.
        indices: Row indices into ``all_tokvecs``. A negative index zero-fills
            the corresponding output row.
        """
        for row in range(indices.shape[0]):
            source = indices[row]
            if source < 0:
                tokvecs[row] = 0
            else:
                tokvecs[row] = all_tokvecs[source]
Tidy up and fix formatting and imports 2017-04-15 11:05:15 +00:00			`# coding: utf-8`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`# cython: infer_types=True`
Tidy up and fix formatting and imports 2017-04-15 11:05:15 +00:00			`from __future__ import unicode_literals`

* Add StateClass, to replace/refactor the mess in _state 2015-06-08 23:39:54 +00:00			`from libc.string cimport memcpy, memset`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`from libc.stdint cimport uint32_t, uint64_t`
Tidy up and fix formatting and imports 2017-04-15 11:05:15 +00:00
* Prepare to switch to using state class, instead of state struct 2015-06-09 19:20:14 +00:00			`from ..vocab cimport EMPTY_LEXEME`
* Greedy parsing working with new StateClass. Beam parsing broken 2015-06-10 02:20:23 +00:00			`from ..structs cimport Entity`
* Unwind limit to sentence boundary detection that prevents it from inserting boundaries on whitespace. Replace it with a check for whitespace in StateClass.fast_forward, so that whitespace is LeftArced when it's on the stack. This should prevent the previous problem of whitespace-only sentences. Should fix Issue #184, but may cause further problems. Needs testing. 2016-01-19 01:54:15 +00:00			`from ..lexeme cimport Lexeme`
			`from ..symbols cimport punct`
			`from ..attrs cimport IS_SPACE`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`from ..attrs cimport attr_id_t`
			`from ..tokens.token cimport Token`
* Add StateClass, to replace/refactor the mess in _state 2015-06-08 23:39:54 +00:00

			`cdef class StateClass:`
* Prepare to switch to using state class, instead of state struct 2015-06-09 19:20:14 +00:00			`def __init__(self, int length):`
			`cdef Pool mem = Pool()`
			`self.mem = mem`
* Continue proxying. Some problem currently 2016-02-01 01:22:21 +00:00
			`def __dealloc__(self):`
			`del self.c`

* Add stack and queue properties to stateclass, for python access 2015-08-08 21:32:42 +00:00			`@property`
			`def stack(self):`
different handling of space tokens space tokens are now always attached to the previous non-space token there are two exceptions: leading space tokens are attached to the first following non-space token in input that consists exclusively of space tokens, the last space token is the head of all others. 2016-04-13 13:28:28 +00:00			`return {self.S(i) for i in range(self.c._s_i)}`
* Add stack and queue properties to stateclass, for python access 2015-08-08 21:32:42 +00:00
			`@property`
			`def queue(self):`
Fix queue Python property in StateClass 2016-10-16 15:04:41 +00:00			`return {self.B(i) for i in range(self.c.buffer_length())}`
* Add stack and queue properties to stateclass, for python access 2015-08-08 21:32:42 +00:00
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`@property`
			`def token_vector_lenth(self):`
			`return self.doc.tensor.shape[1]`

Gradients look correct 2017-05-06 14:47:15 +00:00			`def py_is_final(self):`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00			`return self.c.is_final()`

* Move StateClass into interface of transition functions 2015-06-09 23:35:28 +00:00			`def print_state(self, words):`
			`words = list(words) + ['_']`
* Add unshift action to StateClass, and track which moves have been shifted 2015-06-10 08:13:03 +00:00			`top = words[self.S(0)] + '_%d' % self.S_(0).head`
			`second = words[self.S(1)] + '_%d' % self.S_(1).head`
			`third = words[self.S(2)] + '_%d' % self.S_(2).head`
Tidy up and fix formatting and imports 2017-04-15 11:05:15 +00:00			`n0 = words[self.B(0)]`
			`n1 = words[self.B(1)]`
* Upd stateclass.print_state 2015-06-14 15:44:29 +00:00			`return ' '.join((third, second, top, '\|', n0, n1))`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00
			`def nr_context_tokens(self, int nF, int nB, int nS, int nL, int nR):`
Learns things 2017-05-06 16:24:38 +00:00			`return 4`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00
			`def set_context_tokens(self, int[:] output, nF=1, nB=0, nS=2,`
			`nL=2, nR=2):`
			`output[0] = self.B(0)`
Learns things 2017-05-06 15:37:36 +00:00			`output[1] = self.B(1)`
			`output[2] = self.S(0)`
			`output[3] = self.S(1)`
Learns things 2017-05-06 16:24:38 +00:00			`#output[4] = self.L(self.S(0), 1)`
			`#output[5] = self.L(self.S(0), 2)`
			`#output[6] = self.R(self.S(0), 1)`
			`#output[7] = self.R(self.S(0), 2)`
Gradients look correct 2017-05-06 14:47:15 +00:00			`#output[7] = self.L(self.S(1), 1)`
			`#output[8] = self.L(self.S(1), 2)`
			`#output[9] = self.R(self.S(1), 1)`
			`#output[10] = self.R(self.S(1), 2)`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00
			`def set_attributes(self, uint64_t[:, :] vals, int[:] tokens, int[:] names):`
			`cdef int i, j, tok_i`
			`for i in range(tokens.shape[0]):`
			`tok_i = tokens[i]`
Gradients look correct 2017-05-06 14:47:15 +00:00			`if tok_i >= 0:`
			`token = &self.c._sent[tok_i]`
			`for j in range(names.shape[0]):`
			`vals[i, j] = Token.get_struct_attr(token, <attr_id_t>names[j])`
			`else:`
			`vals[i] = 0`
Data running through, likely errors in model 2017-05-06 12:22:20 +00:00
			`def set_token_vectors(self, float[:, :] tokvecs,`
			`float[:, :] all_tokvecs, int[:] indices):`
			`for i in range(indices.shape[0]):`
Gradients look correct 2017-05-06 14:47:15 +00:00			`if indices[i] >= 0:`
			`tokvecs[i] = all_tokvecs[indices[i]]`
			`else:`
			`tokvecs[i] = 0`