spaCy/spacy/tokens.pyx

# cython: embedsignature=True
from cython.view cimport array as cvarray

from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter

from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA

from unidecode import unidecode

cimport numpy
import numpy

cimport cython


DEF PADDING = 5


cdef int bounds_check(int i, int length, int padding) except -1:
    # Reject an index that falls outside the padded window around the tokens.
    # Indices from -padding up to (but not including) length + padding are
    # accepted; anything beyond that raises IndexError, which is propagated to
    # the caller through the `except -1` error return.
    if i + padding < 0 or i - padding >= length:
        raise IndexError
    return 0


cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
    # LEMMA and POS are read from the token struct itself; every other
    # attribute ID is delegated to the token's lexeme.
    if feat_name == LEMMA:
        return token.lemma
    if feat_name == POS:
        return token.pos
    return get_lex_attr(token.lex, feat_name)


cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    # Attribute IDs smaller than the bit-width of flags_t are treated as
    # boolean flags and answered by check_flag; this test must stay first.
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    # Remaining IDs map one-to-one onto fields of the lexeme struct.
    if feat_name == ID:
        return lex.id
    if feat_name == SIC:
        return lex.sic
    if feat_name == NORM1:
        return lex.norm1
    if feat_name == NORM2:
        return lex.norm2
    if feat_name == SHAPE:
        return lex.shape
    if feat_name == PREFIX:
        return lex.prefix
    if feat_name == SUFFIX:
        return lex.suffix
    if feat_name == LENGTH:
        return lex.length
    if feat_name == CLUSTER:
        return lex.cluster
    # Unrecognised attribute IDs yield 0 rather than an error.
    return 0


cdef class Tokens:
    """Access and set annotations onto some text.

    Tokens are held in a C array of TokenC structs. The array is allocated
    with PADDING extra entries on each side, every one of which points at
    EMPTY_LEXEME, and ``self.data`` points at the first real token. Small
    out-of-range offsets from a valid position therefore land on sentinel
    entries instead of unowned memory.
    """
    def __init__(self, Vocab vocab, unicode string):
        """Create an empty token container for ``string``.

        Arguments:
            vocab (Vocab): The vocabulary whose string store is used to
                resolve attribute IDs.
            string (unicode): The original text the tokens refer to.
        """
        self.vocab = vocab
        self._string = string
        string_length = len(string)
        # Initial capacity: one slot per three characters of text, or 5 slots
        # for very short strings. push_back grows the array when it fills up.
        if string_length >= 3:
            size = int(string_length / 3.0)
        else:
            size = 5
        self.mem = Pool()
        # Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
        # However, we need to remember the true starting places, so that we can
        # realloc.
        data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
        cdef int i
        for i in range(size + (PADDING*2)):
            data_start[i].lex = &EMPTY_LEXEME
        self.data = data_start + PADDING
        self.max_length = size
        self.length = 0

    def sentences(self):
        """Split the tokens into sentence spans on '.', '!' and '?'.

        Returns:
            spans (list[tuple[int, int]]): (start, end) token indices, end
                exclusive. A span closes after each token whose ``sic`` is one
                of the three terminators; trailing tokens without a terminator
                form a final span.
        """
        cdef int i
        cdef attr_t period = self.vocab.strings['.']
        cdef attr_t question = self.vocab.strings['?']
        cdef attr_t exclamation = self.vocab.strings['!']
        spans = []
        start = None
        for i in range(self.length):
            if start is None:
                start = i
            if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
              self.data[i].lex.sic == question:
                spans.append((start, i+1))
                start = None
        if start is not None:
            spans.append((start, self.length))
        return spans

    def __getitem__(self, i):
        """Retrieve a token.

        Negative indices count from the end, so ``tokens[-1]`` is the last
        token. Indices up to PADDING places outside the token range are still
        accepted by the bounds check and yield a sentinel token.

        Returns:
            token (Token):

        Raises:
            IndexError: If ``i`` falls outside the padded range.
        """
        if i < 0:
            # Python convention: -1 is the last element, i.e. length + (-1).
            i = self.length + i
        bounds_check(i, self.length, PADDING)
        return Token(self, i)

    def __iter__(self):
        """Iterate over the tokens.

        Yields:
            token (Token):
        """
        for i in range(self.length):
            yield self[i]

    def __len__(self):
        """Return the number of tokens pushed so far."""
        return self.length

    def __unicode__(self):
        """Return the original text up to the end of the last token."""
        cdef const TokenC* last
        if self.length == 0:
            # No tokens yet: do not rely on the sentinel entry before index 0.
            return u''
        last = &self.data[self.length - 1]
        return self._string[:last.idx + last.lex.length]

    def __str__(self):
        """Return the text transliterated by unidecode."""
        return unidecode(unicode(self))

    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
        """Append a token starting at character offset ``idx``.

        When given a TokenC pointer the whole struct is copied; otherwise only
        the lexeme pointer is stored. In both cases the token's ``idx`` is set
        to the supplied offset.

        Returns:
            The character offset just past the token (idx + lexeme length).
        """
        if self.length == self.max_length:
            # Array is full: double the capacity before writing.
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.data[self.length]
        if LexemeOrToken is TokenC_ptr:
            t[0] = lex_or_tok[0]
        else:
            t.lex = lex_or_tok
        t.idx = idx
        self.length += 1
        return idx + t.lex.length

    @cython.boundscheck(False)
    cpdef long[:,:] to_array(self, object attr_ids):
        """Given a list of M attribute IDs, export the tokens to a 2-D array
        of shape N*M, where N is the length of the sentence.

        Arguments:
            attr_ids (list[int]): A list of attribute ID ints.

        Returns:
            feat_array (long[:, :]):
              A feature matrix backed by a cython.view.array, with one row per
              word, and one column per attribute indicated in the input
              attr_ids.
        """
        cdef int i, j
        cdef attr_id_t feature
        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
                                        itemsize=sizeof(long), format="l")
        for i in range(self.length):
            for j, feature in enumerate(attr_ids):
                output[i, j] = get_token_attr(&self.data[i], feature)
        return output

    def count_by(self, attr_id_t attr_id, exclude=None):
        """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
        by the values of the given attribute ID.

        Arguments:
            attr_id (int): The attribute ID to count by.
            exclude (callable or None): Optional predicate called with each
                Token; tokens for which it returns a true value are skipped.

          >>> from spacy.en import English, attrs
          >>> nlp = English()
          >>> tokens = nlp(u'apple apple orange banana')
          >>> tokens.count_by(attrs.SIC)
          {12800L: 1, 11880L: 2, 7561L: 1}
          >>> tokens.to_array([attrs.SIC])
          array([[11880],
                 [11880],
                 [ 7561],
                 [12800]])
        """
        cdef int i
        cdef attr_t attr

        cdef PreshCounter counts = PreshCounter(2 ** 8)
        for i in range(self.length):
            if exclude is not None and exclude(self[i]):
                continue
            attr = get_token_attr(&self.data[i], attr_id)
            counts.inc(attr, 1)
        return dict(counts)

    def _realloc(self, new_size):
        """Grow the padded token array so it can hold ``new_size`` tokens."""
        self.max_length = new_size
        n = new_size + (PADDING * 2)
        # What we're storing is a "padded" array. We've jumped forward PADDING
        # places, and are storing the pointer to that. This way, we can access
        # words out-of-bounds, and get out-of-bounds markers.
        # Now that we want to realloc, we need the address of the true start,
        # so we jump the pointer back PADDING places.
        cdef TokenC* data_start = self.data - PADDING
        data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
        self.data = data_start + PADDING
        cdef int i
        # Point every unused slot, including the trailing padding, at the
        # empty lexeme again.
        for i in range(self.length, self.max_length + PADDING):
            self.data[i].lex = &EMPTY_LEXEME


@cython.freelist(64)
cdef class Token:
    """A single token, viewed through the Tokens sequence that owns it.

    The constructor copies the token's annotation and lexeme fields into
    attributes. The underscore-suffixed properties look the integer IDs up in
    the vocabulary's string store (or the sequence's tag/dep name tables).
    """
    def __init__(self, Tokens tokens, int i):
        cdef const TokenC* tok = &tokens.data[i]
        self._seq = tokens
        self.i = i
        # Fields stored on the token struct itself.
        self.idx = tok.idx
        self.lemma = tok.lemma
        self.tag = tok.tag
        self.dep = tok.dep
        # Fields taken from the token's lexeme.
        self.sic = tok.lex.sic
        self.norm1 = tok.lex.norm1
        self.norm2 = tok.lex.norm2
        self.shape = tok.lex.shape
        self.prefix = tok.lex.prefix
        self.suffix = tok.lex.suffix
        self.length = tok.lex.length
        self.cluster = tok.lex.cluster
        self.flags = tok.lex.flags
        self.prob = tok.lex.prob
        self.sentiment = tok.lex.sentiment
        # The lexeme's vector is viewed as 300 floats and wrapped by numpy.
        self.repvec = numpy.asarray(<float[:300,]> tok.lex.repvec)

    def __unicode__(self):
        """Return the token text, plus one space unless the next token starts
        directly after it or this is the last token."""
        cdef const TokenC* tok = &self._seq.data[self.i]
        cdef int end_idx = tok.idx + tok.lex.length
        cdef bint is_last = self.i + 1 == self._seq.length
        # Short-circuit: the following entry is only inspected when this is
        # not the final token.
        if is_last or end_idx == tok[1].idx:
            return self.string
        return self.string + ' '

    def __len__(self):
        """The number of unicode code-points in the original string.

        Returns:
            length (int):
        """
        cdef const TokenC* tok = &self._seq.data[self.i]
        return tok.lex.length

    def check_flag(self, attr_id_t flag):
        """Return the bit of ``self.flags`` selected by ``flag`` (non-zero
        when set)."""
        # NOTE(review): `1 << flag` may be evaluated at C int width; confirm
        # this is wide enough for every flag ID stored in `flags`.
        return self.flags & (1 << flag)

    def is_pos(self, univ_tag_t pos):
        """Return whether the token's tag equals ``pos``."""
        return pos == self.tag

    property head:
        """The token predicted by the parser to be the head of the current token."""
        def __get__(self):
            cdef const TokenC* tok = &self._seq.data[self.i]
            # `head` is stored as an offset relative to this token's index.
            return Token(self._seq, self.i + tok.head)

    property string:
        """The unicode string of the word, with no whitespace padding."""
        def __get__(self):
            cdef unicode text
            cdef const TokenC* tok = &self._seq.data[self.i]
            # A sic of 0 yields the empty string without a store lookup.
            if tok.lex.sic == 0:
                return ''
            text = self._seq.vocab.strings[tok.lex.sic]
            return text

    property sic_:
        """String form of ``sic``, looked up in the vocabulary's strings."""
        def __get__(self):
            return self._seq.vocab.strings[self.sic]

    property norm1_:
        """String form of ``norm1``, looked up in the vocabulary's strings."""
        def __get__(self):
            return self._seq.vocab.strings[self.norm1]

    property norm2_:
        """String form of ``norm2``, looked up in the vocabulary's strings."""
        def __get__(self):
            return self._seq.vocab.strings[self.norm2]

    property shape_:
        """String form of ``shape``, looked up in the vocabulary's strings."""
        def __get__(self):
            return self._seq.vocab.strings[self.shape]

    property prefix_:
        """String form of ``prefix``, looked up in the vocabulary's strings."""
        def __get__(self):
            return self._seq.vocab.strings[self.prefix]

    property suffix_:
        """String form of ``suffix``, looked up in the vocabulary's strings."""
        def __get__(self):
            return self._seq.vocab.strings[self.suffix]

    property lemma_:
        """String form of the lemma; falls back to ``string`` when the lemma
        ID is 0."""
        def __get__(self):
            cdef unicode text
            cdef const TokenC* tok = &self._seq.data[self.i]
            if tok.lemma == 0:
                return self.string
            text = self._seq.vocab.strings[tok.lemma]
            return text

    property tag_:
        """Name of the tag, taken from the sequence's ``tag_names``."""
        def __get__(self):
            return self._seq.tag_names[self.tag]

    property dep_:
        """Name of the dependency label, taken from the sequence's ``dep_names``."""
        def __get__(self):
            return self._seq.dep_names[self.dep]
* Embedsignature in tokens.pyx 2014-12-30 10:22:00 +00:00			`# cython: embedsignature=True`
* Make PyPy work 2015-01-05 06:54:13 +00:00			`from cython.view cimport array as cvarray`
* Upd docstrings 2014-12-27 07:45:16 +00:00
* Add as_array and count_by method 2014-12-04 09:46:55 +00:00			`from preshed.maps cimport PreshMap`
			`from preshed.counter cimport PreshCounter`

* Tmp 2014-12-24 06:42:00 +00:00			`from .vocab cimport EMPTY_LEXEME`
* Work on train 2014-12-21 20:25:43 +00:00			`from .typedefs cimport attr_id_t, attr_t`
			`from .typedefs cimport LEMMA`
* Tmp. Refactoring, introducing a Lexeme PyObject. 2015-01-12 00:23:44 +00:00			`from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER`
* Tmp 2014-12-24 06:42:00 +00:00			`from .typedefs cimport POS, LEMMA`

* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`from unidecode import unidecode`

* Messily hook up vector in tokens 2015-01-19 08:59:55 +00:00			`cimport numpy`
			`import numpy`

* Improve array features in tokens 2014-10-22 01:55:42 +00:00			`cimport cython`
* Hack Tokens to work without tagger.pyx 2014-12-03 00:05:15 +00:00
* Move EnglishTokens stuff to Tokens 2014-09-14 23:31:44 +00:00
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`DEF PADDING = 5`

* Generalize tagger code, in preparation for NER and supersense tagging. 2014-11-04 16:42:14 +00:00
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`cdef int bounds_check(int i, int length, int padding) except -1:`
			`if (i + padding) < 0:`
			`raise IndexError`
			`if (i - padding) >= length:`
			`raise IndexError`

* Move EnglishTokens stuff to Tokens 2014-09-14 23:31:44 +00:00
* Tmp 2014-12-24 06:42:00 +00:00			`cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:`
			`if feat_name == LEMMA:`
			`return token.lemma`
			`elif feat_name == POS:`
			`return token.pos`
			`else:`
			`return get_lex_attr(token.lex, feat_name)`


* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-11 23:26:22 +00:00			`cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:`
* Tmp 2014-12-24 06:42:00 +00:00			`if feat_name < (sizeof(flags_t) * 8):`
			`return check_flag(lex, feat_name)`
			`elif feat_name == ID:`
			`return lex.id`
			`elif feat_name == SIC:`
			`return lex.sic`
* Tmp. Refactoring, introducing a Lexeme PyObject. 2015-01-12 00:23:44 +00:00			`elif feat_name == NORM1:`
			`return lex.norm1`
			`elif feat_name == NORM2:`
			`return lex.norm2`
* Tmp 2014-12-24 06:42:00 +00:00			`elif feat_name == SHAPE:`
			`return lex.shape`
			`elif feat_name == PREFIX:`
			`return lex.prefix`
			`elif feat_name == SUFFIX:`
			`return lex.suffix`
			`elif feat_name == LENGTH:`
			`return lex.length`
			`elif feat_name == CLUSTER:`
			`return lex.cluster`
			`else:`
			`return 0`


* Refactoring to use Tokens object 2014-09-10 16:11:13 +00:00			`cdef class Tokens:`
* Upd docstrings 2014-12-27 07:45:16 +00:00			`"""Access and set annotations onto some text.`
* Refactoring to use Tokens object 2014-09-10 16:11:13 +00:00			`"""`
* Bug fixes to sentences method, and improved vector transport for tokens 2015-01-21 07:56:32 +00:00			`def __init__(self, Vocab vocab, unicode string):`
* Work on train 2014-12-21 20:25:43 +00:00			`self.vocab = vocab`
* Bug fixes to sentences method, and improved vector transport for tokens 2015-01-21 07:56:32 +00:00			`self._string = string`
			`string_length = len(string)`
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`if string_length >= 3:`
			`size = int(string_length / 3.0)`
			`else:`
			`size = 5`
			`self.mem = Pool()`
			`# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds`
			`# However, we need to remember the true starting places, so that we can`
			`# realloc.`
* Remove need for confusing _data pointer to be stored on Tokens 2014-12-05 05:31:30 +00:00			`data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))`
* Fix padding on tokens 2014-10-22 17:01:17 +00:00			`cdef int i`
			`for i in range(size + (PADDING*2)):`
* Fix EMPTY_TOKEN 2014-12-07 11:07:41 +00:00			`data_start[i].lex = &EMPTY_LEXEME`
* Remove need for confusing _data pointer to be stored on Tokens 2014-12-05 05:31:30 +00:00			`self.data = data_start + PADDING`
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`self.max_length = size`
			`self.length = 0`
* Work on efficiency 2014-10-14 07:22:41 +00:00
* Fix negative indices in tokens 2015-01-19 14:16:29 +00:00			`def sentences(self):`
			`cdef int i`
			`sentences = []`
* Bug fixes to sentences method, and improved vector transport for tokens 2015-01-21 07:56:32 +00:00			`cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])`
* Fix negative indices in tokens 2015-01-19 14:16:29 +00:00			`cdef attr_t period = self.vocab.strings['.']`
			`cdef attr_t question = self.vocab.strings['?']`
			`cdef attr_t exclamation = self.vocab.strings['!']`
* Have SBD return start/end indices 2015-01-22 11:24:44 +00:00			`spans = []`
			`start = None`
* Fix negative indices in tokens 2015-01-19 14:16:29 +00:00			`for i in range(self.length):`
* Have SBD return start/end indices 2015-01-22 11:24:44 +00:00			`if start is None:`
			`start = i`
* Fix negative indices in tokens 2015-01-19 14:16:29 +00:00			`if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \`
			`self.data[i].lex.sic == question:`
* Have SBD return start/end indices 2015-01-22 11:24:44 +00:00			`spans.append((start, i+1))`
			`start = None`
			`if start is not None:`
			`spans.append((start, self.length))`
			`return spans`
* Fix negative indices in tokens 2015-01-19 14:16:29 +00:00
* Switch to returning a Tokens object 2014-09-11 19:37:32 +00:00			`def __getitem__(self, i):`
* Upd docstrings 2014-12-27 07:45:16 +00:00			`"""Retrieve a token.`

			`Returns:`
			`token (Token):`
			`"""`
* Fix negative indices in tokens 2015-01-19 14:16:29 +00:00			`if i < 0:`
			`i = self.length - i`
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`bounds_check(i, self.length, PADDING)`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`return Token(self, i)`
* Switch to returning a Tokens object 2014-09-11 19:37:32 +00:00
* Add __iter__ method to tokens 2014-11-03 14:07:08 +00:00			`def __iter__(self):`
* Upd docstrings 2014-12-27 07:45:16 +00:00			`"""Iterate over the tokens.`

			`Yields:`
			`token (Token):`
			`"""`
* Add __iter__ method to tokens 2014-11-03 14:07:08 +00:00			`for i in range(self.length):`
			`yield self[i]`

* Switch to returning a Tokens object 2014-09-11 19:37:32 +00:00			`def __len__(self):`
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`return self.length`
* Switch to returning a Tokens object 2014-09-11 19:37:32 +00:00
* Bug fixes to sentences method, and improved vector transport for tokens 2015-01-21 07:56:32 +00:00			`def __unicode__(self):`
			`cdef const TokenC* last = &self.data[self.length - 1]`
			`return self._string[:last.idx + last.lex.length]`

* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`def __str__(self):`
			`return unidecode(unicode(self))`

* Use fused type in Tokens.push_back, simplifying the use of the cache 2014-12-09 05:50:01 +00:00			`cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:`
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`if self.length == self.max_length:`
			`self._realloc(self.length * 2)`
* Introduce a TokenC struct, to handle token indices, pos tags and sense tags 2014-12-05 04:56:14 +00:00			`cdef TokenC* t = &self.data[self.length]`
* Use fused type in Tokens.push_back, simplifying the use of the cache 2014-12-09 05:50:01 +00:00			`if LexemeOrToken is TokenC_ptr:`
			`t[0] = lex_or_tok[0]`
			`else:`
			`t.lex = lex_or_tok`
* Tmp 2014-12-24 06:42:00 +00:00			`t.idx = idx`
* Remove the feature array stuff from Tokens class, and replace vector with array-based implementation, with padding. 2014-10-22 14:57:59 +00:00			`self.length += 1`
* Use fused type in Tokens.push_back, simplifying the use of the cache 2014-12-09 05:50:01 +00:00			`return idx + t.lex.length`
* Have tokens track tuples that record the start offset and pos tag as well as a lexeme pointer 2014-10-14 04:21:03 +00:00
* Add as_array and count_by method 2014-12-04 09:46:55 +00:00			`@cython.boundscheck(False)`
* Make PyPy work 2015-01-05 06:54:13 +00:00			`cpdef long[:,:] to_array(self, object attr_ids):`
* Work on docstrings 2014-12-27 10:46:04 +00:00			`"""Given a list of M attribute IDs, export the tokens to a numpy ndarray`
			`of shape N*M, where N is the length of the sentence.`

			`Arguments:`
			`attr_ids (list[int]): A list of attribute ID ints.`

			`Returns:`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 13:33:16 +00:00			`feat_array (numpy.ndarray[long, ndim=2]):`
			`A feature matrix, with one row per word, and one column per attribute`
			`indicated in the input attr_ids.`
* Work on docstrings 2014-12-27 10:46:04 +00:00			`"""`
* Work on get_array method of Tokens 2014-12-02 12:48:05 +00:00			`cdef int i, j`
* Add as_array and count_by method 2014-12-04 09:46:55 +00:00			`cdef attr_id_t feature`
* Make PyPy work 2015-01-05 06:54:13 +00:00			`cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),`
			`itemsize=sizeof(long), format="l")`
* Work on get_array method of Tokens 2014-12-02 12:48:05 +00:00			`for i in range(self.length):`
* Add as_array and count_by method 2014-12-04 09:46:55 +00:00			`for j, feature in enumerate(attr_ids):`
* Tmp 2014-12-24 06:42:00 +00:00			`output[i, j] = get_token_attr(&self.data[i], feature)`
* Work on get_array method of Tokens 2014-12-02 12:48:05 +00:00			`return output`

* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`def count_by(self, attr_id_t attr_id, exclude=None):`
* Work on docstrings 2014-12-27 10:46:04 +00:00			`"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed`
			`by the values of the given attribute ID.`

			`>>> from spacy.en import English, attrs`
			`>>> nlp = English()`
			`>>> tokens = nlp(u'apple apple orange banana')`
			`>>> tokens.count_by(attrs.SIC)`
			`{12800L: 1, 11880L: 2, 7561L: 1}`
			`>>> tokens.to_array([attrs.SIC])`
			`array([[11880],`
			`[11880],`
			`[ 7561],`
			`[12800]])`
			`"""`
* Add as_array and count_by method 2014-12-04 09:46:55 +00:00			`cdef int i`
			`cdef attr_t attr`
			`cdef size_t count`

			`cdef PreshCounter counts = PreshCounter(2 ** 8)`
			`for i in range(self.length):`
* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`if exclude is not None and exclude(self[i]):`
			`continue`
* Tmp 2014-12-24 06:42:00 +00:00			`attr = get_token_attr(&self.data[i], attr_id)`
* Add as_array and count_by method 2014-12-04 09:46:55 +00:00			`counts.inc(attr, 1)`
			`return dict(counts)`

* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`def _realloc(self, new_size):`
			`self.max_length = new_size`
			`n = new_size + (PADDING * 2)`
* Fix EMPTY_TOKEN 2014-12-07 11:07:41 +00:00			`# What we're storing is a "padded" array. We've jumped forward PADDING`
			`# places, and are storing the pointer to that. This way, we can access`
			`# words out-of-bounds, and get out-of-bounds markers.`
			`# Now that we want to realloc, we need the address of the true start,`
			`# so we jump the pointer back PADDING places.`
* Remove need for confusing _data pointer to be stored on Tokens 2014-12-05 05:31:30 +00:00			`cdef TokenC* data_start = self.data - PADDING`
			`data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))`
			`self.data = data_start + PADDING`
* Introduce a TokenC struct, to handle token indices, pos tags and sense tags 2014-12-05 04:56:14 +00:00			`cdef int i`
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`for i in range(self.length, self.max_length + PADDING):`
* Fix EMPTY_TOKEN 2014-12-07 11:07:41 +00:00			`self.data[i].lex = &EMPTY_LEXEME`
* Move EnglishTokens stuff to Tokens 2014-09-14 23:31:44 +00:00

* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`@cython.freelist(64)`
* Large refactor, particularly to Python API 2014-10-23 13:59:17 +00:00			`cdef class Token:`
* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`"""An individual token."""`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`def __init__(self, Tokens tokens, int i):`
			`self._seq = tokens`
			`self.i = i`
* Revise interface to Token. Strings now have attribute names like norm1_ 2015-01-14 16:51:47 +00:00			`cdef const TokenC* t = &tokens.data[i]`
			`self.idx = t.idx`
			`self.cluster = t.lex.cluster`
			`self.length = t.lex.length`
			`self.sic = t.lex.sic`
			`self.norm1 = t.lex.norm1`
			`self.norm2 = t.lex.norm2`
			`self.shape = t.lex.shape`
			`self.prefix = t.lex.prefix`
			`self.suffix = t.lex.suffix`
			`self.prob = t.lex.prob`
			`self.sentiment = t.lex.sentiment`
			`self.flags = t.lex.flags`
			`self.lemma = t.lemma`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`self.tag = t.tag`
			`self.dep = t.dep`
* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)`
* Messily hook up vector in tokens 2015-01-19 08:59:55 +00:00
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`def __unicode__(self):`
			`cdef const TokenC* t = &self._seq.data[self.i]`
			`cdef int end_idx = t.idx + t.lex.length`
			`if self.i + 1 == self._seq.length:`
			`return self.string`
			`if end_idx == t[1].idx:`
			`return self.string`
			`else:`
			`return self.string + ' '`
* Tmp 2014-12-24 06:42:00 +00:00
			`def __len__(self):`
* Upd docstrings 2014-12-27 07:45:16 +00:00			`"""The number of unicode code-points in the original string.`

			`Returns:`
			`length (int):`
			`"""`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`return self._seq.data[self.i].lex.length`

* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`def check_flag(self, attr_id_t flag):`
* Various tweaks to Tokens class 2015-01-21 15:05:37 +00:00			`return self.flags & (1 << flag)`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00
			`def is_pos(self, univ_tag_t pos):`
* Bug fixes to sentences method, and improved vector transport for tokens 2015-01-21 07:56:32 +00:00			`return self.tag == pos`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00
* Revise interface to Token. Strings now have attribute names like norm1_ 2015-01-14 16:51:47 +00:00			`property head:`
			`"""The token predicted by the parser to be the head of the current token."""`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`def __get__(self):`
* Revise interface to Token. Strings now have attribute names like norm1_ 2015-01-14 16:51:47 +00:00			`cdef const TokenC* t = &self._seq.data[self.i]`
			`return Token(self._seq, self.i + t.head)`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00
			`property string:`
			`"""The unicode string of the word, with no whitespace padding."""`
			`def __get__(self):`
			`cdef const TokenC* t = &self._seq.data[self.i]`
			`if t.lex.sic == 0:`
			`return ''`
			`cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]`
			`return py_ustr`

* Revise interface to Token. Strings now have attribute names like norm1_ 2015-01-14 16:51:47 +00:00			`property sic_:`
			`def __get__(self):`
			`return self._seq.vocab.strings[self.sic]`

			`property norm1_:`
			`def __get__(self):`
			`return self._seq.vocab.strings[self.norm1]`

			`property norm2_:`
			`def __get__(self):`
			`return self._seq.vocab.strings[self.norm2]`

			`property shape_:`
			`def __get__(self):`
			`return self._seq.vocab.strings[self.shape]`

			`property prefix_:`
			`def __get__(self):`
			`return self._seq.vocab.strings[self.prefix]`

			`property suffix_:`
			`def __get__(self):`
			`return self._seq.vocab.strings[self.suffix]`

			`property lemma_:`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`def __get__(self):`
			`cdef const TokenC* t = &self._seq.data[self.i]`
			`if t.lemma == 0:`
			`return self.string`
			`cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]`
			`return py_ustr`

* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`property tag_:`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`def __get__(self):`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`return self._seq.tag_names[self.tag]`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`property dep_:`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 13:03:48 +00:00			`def __get__(self):`
* Work on word vectors, and other stuff 2015-01-17 05:21:17 +00:00			`return self._seq.dep_names[self.dep]`