mirror of https://github.com/explosion/spaCy.git
* Add Span class to Python API
This commit is contained in:
parent
b9b695fb1b
commit
64db61bff1
|
@ -54,6 +54,12 @@ cdef class Tokens:
|
||||||
cdef int set_parse(self, const TokenC* parsed, dict label_ids) except -1
|
cdef int set_parse(self, const TokenC* parsed, dict label_ids) except -1
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Span:
    # Parent Tokens sequence this span reads from. Declared without
    # `public`, so it is only reachable from Cython code.
    cdef Tokens _seq
    # Token offsets into the parent sequence; `public` makes both readable
    # and writable from Python.
    cdef public int start, end
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef Vocab vocab
|
cdef Vocab vocab
|
||||||
cdef unicode _string
|
cdef unicode _string
|
||||||
|
|
|
@ -237,24 +237,15 @@ cdef class Tokens:
|
||||||
"""This is really only a place-holder for a proper solution."""
|
"""This is really only a place-holder for a proper solution."""
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
|
cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
|
||||||
#cdef attr_t period = self.vocab.strings['.']
|
|
||||||
#cdef attr_t question = self.vocab.strings['?']
|
|
||||||
#cdef attr_t exclamation = self.vocab.strings['!']
|
|
||||||
spans = []
|
|
||||||
start = None
|
start = None
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if start is None:
|
if start is None:
|
||||||
start = i
|
start = i
|
||||||
if self.data[i].sent_end:
|
if self.data[i].sent_end:
|
||||||
spans.append((start, i+1))
|
yield Span(self, start, i+1)
|
||||||
start = None
|
start = None
|
||||||
#if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
|
|
||||||
# self.data[i].lex.orth == question:
|
|
||||||
# spans.append((start, i+1))
|
|
||||||
# start = None
|
|
||||||
if start is not None:
|
if start is not None:
|
||||||
spans.append((start, self.length))
|
yield Span(self, start, self.length)
|
||||||
return spans
|
|
||||||
|
|
||||||
cdef int set_parse(self, const TokenC* parsed, dict label_ids) except -1:
|
cdef int set_parse(self, const TokenC* parsed, dict label_ids) except -1:
|
||||||
self._py_tokens = [None] * self.length
|
self._py_tokens = [None] * self.length
|
||||||
|
@ -267,6 +258,26 @@ cdef class Tokens:
|
||||||
self._dep_strings = tuple(dep_strings)
|
self._dep_strings = tuple(dep_strings)
|
||||||
|
|
||||||
|
|
||||||
|
cdef class Span:
    """A slice from a Tokens object.

    A Span does not copy any tokens: it stores a reference to the parent
    Tokens object together with a half-open token range ``[start, end)``
    and looks tokens up in the parent on demand.
    """
    def __cinit__(self, Tokens tokens, int start, int end):
        """Create a view of ``tokens`` covering ``[start, end)``.

        tokens -- the parent Tokens object the span reads from.
        start  -- offset of the first token in the span.
        end    -- offset one past the last token in the span.
        """
        self._seq = tokens
        self.start = start
        self.end = end

    def __len__(self):
        """Return the number of tokens in the span (never negative)."""
        # An inverted range is reported as empty instead of a negative length,
        # which Python's len() would reject.
        if self.end < self.start:
            return 0
        return self.end - self.start

    def __getitem__(self, int i):
        """Return the i-th token of the span.

        Negative indices count back from the end of the span. Raises
        IndexError when ``i`` falls outside the span, so indexing can never
        return a token that belongs to the parent but not to this slice.
        """
        cdef int length = self.end - self.start
        if length < 0:
            length = 0
        # Resolve negative indices relative to the span, not the parent:
        # span[-1] must be the last token of the span.
        if i < 0:
            i += length
        if i < 0 or i >= length:
            raise IndexError("Span index out of range")
        return self._seq[self.start + i]

    def __iter__(self):
        """Yield the span's tokens in order, fetched from the parent."""
        for i in range(self.start, self.end):
            yield self._seq[i]
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||||
via Tokens.__getitem__ and Tokens.__iter__.
|
via Tokens.__getitem__ and Tokens.__iter__.
|
||||||
|
|
Loading…
Reference in New Issue