* Move Span class to its own file

This commit is contained in:
Matthew Honnibal 2015-03-26 03:18:34 +01:00
parent f02c39dfaf
commit 6f47a667cf
1 changed file with 7 additions and 35 deletions

View File

@ -10,6 +10,7 @@ from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLU
from .typedefs cimport POS, LEMMA
from .parts_of_speech import UNIV_POS_NAMES
from .lexeme cimport check_flag
from .spans import Span
from unidecode import unidecode
@ -132,7 +133,7 @@ cdef class Tokens:
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef object label = None
cdef int label = 0
for i in range(self.length):
token = &self.data[i]
if token.ent_iob == 1:
@ -140,15 +141,15 @@ cdef class Tokens:
pass
elif token.ent_iob == 2:
if start != -1:
yield (start, i, label)
yield Span(self, start, i, label=label)
start = -1
label = None
label = 0
elif token.ent_iob == 3:
start = i
label = self.vocab.strings[token.ent_type]
label = token.ent_type
if start != -1:
yield (start, self.length, label)
yield Span(self, start, self.length, label=label)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
@ -253,35 +254,6 @@ cdef class Tokens:
self.data[i] = parsed[i]
cdef class Span:
    """A slice from a Tokens object.

    A Span refers to the tokens ``start`` (inclusive) through ``end``
    (exclusive) of the parent sequence it was created from; it stores only
    the parent reference and the two offsets.
    """
    def __cinit__(self, Tokens tokens, int start, int end):
        """Record the parent sequence and the [start, end) offsets into it."""
        self._seq = tokens
        self.start = start
        self.end = end

    def __richcmp__(self, Span other, int op):
        """Compare two spans by parent identity and offsets.

        ``op`` is the rich-comparison code passed by Cython:
        0 ``<``, 1 ``<=``, 2 ``==``, 3 ``!=``, 4 ``>``, 5 ``>=``.
        Spans have no ordering, so ``<=`` and ``>=`` hold only when the
        spans are equal, and ``<`` / ``>`` are always False.
        """
        # A typed extension-type argument still admits None; reading its
        # attributes would be unsafe, so answer the comparison up front.
        if other is None:
            return op == 3
        same = (
            self._seq is other._seq
            and self.start == other.start
            and self.end == other.end
        )
        if op in (1, 2, 5):
            return same
        if op == 3:
            # Previously fell through to False, so `a != b` was never true.
            return not same
        return False

    def __len__(self):
        """Return the number of tokens covered; 0 if end precedes start."""
        if self.end < self.start:
            return 0
        return self.end - self.start

    def __getitem__(self, int i):
        """Return the i-th token of the span.

        Negative indices count back from the end of the span, so
        ``span[-1]`` is the last token inside it rather than the token
        preceding ``start`` in the parent sequence.
        """
        if i < 0:
            return self._seq[self.end + i]
        return self._seq[self.start + i]

    def __iter__(self):
        """Yield the parent's tokens at positions start .. end - 1, in order."""
        for i in range(self.start, self.end):
            yield self._seq[i]
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Tokens.__getitem__ and Tokens.__iter__.