* Add Span class to Python API

This commit is contained in:
Matthew Honnibal 2015-03-13 19:21:16 -04:00
parent b9b695fb1b
commit 64db61bff1
2 changed files with 28 additions and 11 deletions

View File

@ -54,6 +54,12 @@ cdef class Tokens:
cdef int set_parse(self, const TokenC* parsed, dict label_ids) except -1
# C-level attribute declarations for Span, a view onto part of a Tokens object.
cdef class Span:
# The Tokens object that the span's indexing and iteration delegate to.
cdef Tokens _seq
# Index into _seq of the first token in the span; `public` exposes it to Python.
cdef public int start
# Index into _seq one past the last token: iteration covers range(start, end).
cdef public int end
cdef class Token:
cdef Vocab vocab
cdef unicode _string

View File

@ -237,24 +237,15 @@ cdef class Tokens:
"""This is really only a place-holder for a proper solution."""
cdef int i
cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
#cdef attr_t period = self.vocab.strings['.']
#cdef attr_t question = self.vocab.strings['?']
#cdef attr_t exclamation = self.vocab.strings['!']
spans = []
start = None
for i in range(self.length):
if start is None:
start = i
if self.data[i].sent_end:
spans.append((start, i+1))
yield Span(self, start, i+1)
start = None
#if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
# self.data[i].lex.orth == question:
# spans.append((start, i+1))
# start = None
if start is not None:
spans.append((start, self.length))
return spans
yield Span(self, start, self.length)
cdef int set_parse(self, const TokenC* parsed, dict label_ids) except -1:
self._py_tokens = [None] * self.length
@ -267,6 +258,26 @@ cdef class Tokens:
self._dep_strings = tuple(dep_strings)
cdef class Span:
    """A slice from a Tokens object.

    The span covers the tokens of the underlying sequence at positions
    ``start`` (inclusive) to ``end`` (exclusive). Nothing is copied: indexing
    and iteration are delegated to the parent Tokens object, offset by
    ``start``.
    """
    def __cinit__(self, Tokens tokens, int start, int end):
        # Keep a reference to the parent sequence along with the span bounds.
        self._seq = tokens
        self.start = start
        self.end = end

    def __len__(self):
        # An inverted span (end before start) is treated as empty rather than
        # reporting a negative length, which len() would reject.
        if self.end < self.start:
            return 0
        return self.end - self.start

    def __getitem__(self, int i):
        # Indices are relative to the span, not to the parent sequence.
        # Negative indices count back from the end of the span, following the
        # usual Python sequence convention.
        n = len(self)
        offset = i + n if i < 0 else i
        # Without this check, span[-1] would return the token just before the
        # span and span[len(span)] the token just after it, silently leaking
        # into the surrounding sequence.
        if offset < 0 or offset >= n:
            raise IndexError("Span index out of range")
        return self._seq[self.start + offset]

    def __iter__(self):
        # Yield each token of the span in order, fetched from the parent.
        for i in range(self.start, self.end):
            yield self._seq[i]
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Tokens.__getitem__ and Tokens.__iter__.