2014-07-07 10:47:21 +00:00
|
|
|
from cython.operator cimport dereference as deref
|
|
|
|
from cython.operator cimport preincrement as inc
|
|
|
|
|
|
|
|
|
2014-07-07 14:58:48 +00:00
|
|
|
from spacy.lexeme cimport Lexeme
|
2014-07-07 18:27:02 +00:00
|
|
|
from spacy.lexeme cimport attr_of, norm_of, shape_of
|
2014-07-07 14:58:48 +00:00
|
|
|
from spacy.spacy cimport StringHash
|
|
|
|
|
|
|
|
|
2014-07-07 10:47:21 +00:00
|
|
|
cdef class Tokens:
    """Growable sequence of ``Lexeme_addr`` values tied to a ``Language``.

    The values live in a heap-allocated C++ ``vector[Lexeme_addr]`` owned by
    the instance; ``length`` is maintained alongside it and is what
    ``len()`` reports.
    """

    def __cinit__(self, Language lang):
        """Store ``lang`` and allocate an empty backing vector."""
        self.lang = lang
        self.vctr = new vector[Lexeme_addr]()
        self.length = 0

    def __dealloc__(self):
        """Free the vector allocated in ``__cinit__``."""
        del self.vctr

    def __iter__(self):
        """Yield each stored value in insertion order.

        The generator keeps a C++ iterator alive between yields, so the
        vector must not be appended to while an iteration is in progress:
        ``push_back`` may reallocate and invalidate that iterator.
        """
        cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin()
        while it != self.vctr[0].end():
            yield deref(it)
            inc(it)

    def __getitem__(self, size_t idx):
        """Return the value at position ``idx`` (bounds-checked by ``at``)."""
        return self.vctr[0].at(idx)

    def __len__(self):
        """Return the number of values appended so far."""
        return self.length

    cpdef int append(self, Lexeme_addr token):
        """Append ``token`` to the end of the sequence."""
        self.vctr[0].push_back(token)
        self.length += 1

    cpdef int extend(self, Tokens other) except -1:
        """Append every value currently held by ``other`` to this sequence.

        Raises:
            TypeError: if ``other`` is None.
        """
        cdef size_t n
        cdef size_t i
        # A typed argument may still be None; dereferencing its C fields
        # would crash, so fail the same way iterating None used to.
        if other is None:
            raise TypeError("'NoneType' object is not iterable")
        # Take the size up front and copy by index instead of going through
        # ``__iter__``: when ``other is self`` every append grows the vector
        # being read, which would invalidate a live iterator and keep moving
        # ``end()``. Indexing against a fixed count is safe in that case.
        n = other.vctr[0].size()
        for i in range(n):
            self.append(other.vctr[0].at(i))

    cpdef list group_by(self, StringAttr attr):
        """Partition the stored values by ``attr_of(value, attr)``.

        Returns a list of groups, ordered by the first appearance of each
        key; within a group the values keep their original order.
        """
        # Maps each key to the position of its group inside ``groups``.
        cdef dict indices = {}
        cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]()
        cdef StringHash key
        cdef Lexeme_addr t
        for t in self.vctr[0]:
            key = attr_of(t, attr)
            if key in indices:
                groups[indices[key]].push_back(t)
            else:
                # First time this key is seen: open a new group at the end.
                indices[key] = groups.size()
                groups.push_back(vector[Lexeme_addr]())
                groups.back().push_back(t)
        # The nested vector is coerced to a Python list on return.
        return groups

    cpdef dict count_by(self, StringAttr attr):
        """Count the stored values per ``attr_of(value, attr)`` key.

        Returns a dict mapping each key to its number of occurrences.
        """
        cdef dict counts = {}
        cdef Lexeme_addr t
        cdef StringHash key
        for t in self.vctr[0]:
            key = attr_of(t, attr)
            counts[key] = counts.get(key, 0) + 1
        return counts
|