spaCy/spacy/tokens.pyx

from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc
from spacy.lexeme cimport Lexeme
from spacy.spacy cimport StringHash


# Tokens owns a heap-allocated C++ vector of Lexeme_addr values.
cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
        self.vctr = new vector[Lexeme_addr]()
        self.length = 0

    def __dealloc__(self):
        # The vector is heap-allocated in __cinit__, so free it here.
        del self.vctr

    def __iter__(self):
        cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin()
        while it != self.vctr[0].end():
            yield deref(it)
            inc(it)

    def __getitem__(self, size_t idx):
        return self.vctr[0].at(idx)

    def __len__(self):
        return self.length

    cpdef int append(self, Lexeme_addr token) except -1:
        self.vctr[0].push_back(token)
        self.length += 1

    cpdef int extend(self, Tokens other) except -1:
        cdef Lexeme_addr el
        for el in other:
            self.append(el)

    cpdef object group_by(self, size_t attr):
        '''Group tokens that share the property attr into Tokens instances,
        and return them as a tuple of three aligned lists:

            (string names, hashes, tokens)

        The ith entry in string names is the string that the ith entry in
        hashes unhashes to, and it is the value the ith Tokens instance is
        grouped under. You can then use count_by or group_by on those Tokens
        for further processing. Calling group_by and then asking for the
        length of each Tokens object is equivalent to count_by, but somewhat
        slower.
        '''
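        # A hedged usage sketch (not from this file): LEX_ORTH below is a
        # hypothetical attribute id, standing in for whatever attribute
        # constants the package defines elsewhere.
        #
        #   names, hashes, groups = tokens.group_by(LEX_ORTH)
        #   counts = tokens.count_by(LEX_ORTH)
        #   for key, group in zip(hashes, groups):
        #       assert counts[key] == len(group)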
        # Implementation here is working around some of the constraints
        # Cython puts on what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run
        # faster, we can be tricky and get the Python list access out of the
        # loop: what we'd do is store pointers to the underlying vectors.
        # So far, speed hasn't mattered here.
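        # A sketch of that pointer trick, purely hypothetical and not what
        # runs below (it assumes Tokens keeps exposing its vctr attribute,
        # and the length bookkeeping would still need updating):
        #
        #   cdef vector[vector[Lexeme_addr]*] group_vectors
        #   ...
        #   group_vectors.push_back((<Tokens>groups[-1]).vctr)
        #   ...
        #   group_vectors[indices[key]][0].push_back(t)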
        cdef dict indices = {}
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []

        cdef StringHash key
        cdef Lexeme_addr t
        for t in self.vctr[0]:
            key = self.lang.attr_of(t, attr)
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
                groups[-1].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, size_t attr):
        cdef dict counts = {}
        cdef Lexeme_addr t
        cdef StringHash key
        for t in self.vctr[0]:
            key = self.lang.attr_of(t, attr)
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts