# spacy/tokens.pxd — Cython declaration file for the Tokens and Token classes.
from libc.stdint cimport uint32_t
from numpy cimport ndarray
cimport numpy
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore
# Named pointer typedefs so the fused type below can refer to both pointer kinds.
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr


# Fused type: either a const lexeme pointer or a mutable token pointer.
# Used by Tokens.push_back so one entry point accepts both representations.
ctypedef fused LexemeOrToken:
    const_Lexeme_ptr
    TokenC_ptr


# Read a single attribute value from a lexeme, selected by attribute id.
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil

# Read a single attribute value from a token, selected by attribute id.
# NOTE(review): the parameter is named `lex` but points at a TokenC.
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    """Test whether bit `flag_id` is set in the lexeme's boolean-flags field."""
    # Shift a flags_t-typed constant, not a bare literal: `1` is a C int, so
    # `1 << flag_id` is undefined/truncated once flag_id reaches 31, silently
    # breaking every high flag bit even though lexeme.flags is wider.
    # (Assumes flags_t is an unsigned 64-bit typedef — TODO confirm in .typedefs.)
    cdef flags_t one = 1
    return (lexeme.flags & (one << flag_id)) != 0
cdef class Tokens:
    # Memory pool owning the C-level token array.
    cdef Pool mem
    # Vocabulary shared with the rest of the pipeline.
    cdef Vocab vocab
    # Contiguous C array of TokenC structs (length entries valid).
    cdef TokenC* data

    # Lazily-created Python Token wrappers, one slot per token (see Token.cinit).
    cdef list _py_tokens
    # The original text the tokens were produced from.
    cdef unicode _string
    # String forms for tag / dependency-label ids.
    cdef tuple _tag_strings
    cdef tuple _dep_strings

    # Flags exposed to Python marking which annotations have been applied.
    cdef public bint is_tagged
    cdef public bint is_parsed

    # Number of tokens currently stored, and the allocated capacity of `data`.
    cdef int length
    cdef int max_length

    # Append a lexeme or token at position i; returns -1 on error.
    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1

    # Export the requested attribute columns as a 2D integer array.
    cpdef long[:,:] to_array(self, object features)
cdef class Token:
    # Vocabulary and source text, shared with the owning Tokens object.
    cdef Vocab vocab
    cdef unicode _string

    # Pointer into the owning Tokens object's TokenC array (not owned here).
    cdef const TokenC* c
    # This token's index within the array; read-only from Python.
    cdef readonly int i
    # Length of the backing array, used for bounds checking in cinit.
    cdef int array_len

    # Shared cache list from the owning Tokens object (its _py_tokens), plus
    # the tag / dependency-label string tables.
    cdef list _py
    cdef tuple _tag_strings
    cdef tuple _dep_strings

    @staticmethod
    cdef inline Token cinit(Vocab vocab, unicode string,
                            const TokenC* token, int offset, int array_len,
                            list py_tokens, tuple tag_strings, tuple dep_strings):
        """Fetch-or-create the Python wrapper for the token at `offset`.

        Wrappers are cached in `py_tokens`, so at most one Token object
        exists per position; repeated access returns the same instance.
        Raises IndexError if `offset` is outside [0, array_len).
        """
        if offset < 0 or offset >= array_len:
            msg = "Attempt to access token at %d, max length %d"
            raise IndexError(msg % (offset, array_len))
        # Cache hit: reuse the previously built wrapper for this position.
        if py_tokens[offset] is not None:
            return py_tokens[offset]
        # __new__ (not a normal constructor call) so allocation stays cheap;
        # all remaining fields are filled in directly below.
        cdef Token self = Token.__new__(Token, vocab, string)
        self.c = token
        self.i = offset
        self.array_len = array_len
        self._py = py_tokens
        self._tag_strings = tag_strings
        self._dep_strings = dep_strings
        # Publish into the shared cache before returning.
        py_tokens[offset] = self
        return self