* Rearrange code in tokens.pyx

This commit is contained in:
Matthew Honnibal 2015-04-13 05:41:25 +02:00
parent 5ce51ce8d6
commit fbd48c571d
1 changed file with 54 additions and 36 deletions

View File

@ -136,27 +136,45 @@ cdef class Tokens:
cdef const TokenC* last = &self.data[self.length - 1] cdef const TokenC* last = &self.data[self.length - 1]
return self._string[:last.idx + last.lex.length] return self._string[:last.idx + last.lex.length]
@property
def ents(self):
    """Yield a Span for each named entity, in document order.

    Entities are recovered from the per-token ``ent_iob`` codes, where
    1 = inside an entity, 2 = outside any entity, 3 = begins an entity.
    Any other code leaves the current state untouched.  Each yielded Span
    covers the half-open token range [start, end) and carries the
    ``ent_type`` of the token that opened the entity as its label.

    Raises:
        AssertionError: if an "inside" token appears with no open entity.
    """
    cdef int i
    cdef const TokenC* token
    cdef int start = -1   # index of the token that opened the current entity, -1 if none
    cdef int label = 0
    for i in range(self.length):
        token = &self.data[i]
        if token.ent_iob == 1:
            # Continuation of an entity: one must already be open.
            assert start != -1
        elif token.ent_iob == 2:
            # Outside: close whatever entity was open.
            if start != -1:
                yield Span(self, start, i, label=label)
            start = -1
            label = 0
        elif token.ent_iob == 3:
            # A new entity may directly follow another one with no
            # "outside" token in between.  Emit the open entity first,
            # otherwise it would be silently dropped.
            if start != -1:
                yield Span(self, start, i, label=label)
            start = i
            label = token.ent_type
    # An entity that runs to the end of the document is still open here.
    if start != -1:
        yield Span(self, start, self.length, label=label)
@property
def sents(self):
    """Yield one Span per sentence, in document order.

    Sentence boundaries are read from each token's ``sent_end`` flag
    (per the original note, these come from the dependency parse).  A
    sentence runs from the first token after the previous boundary up to
    and including the flagged token.  Tokens after the last flagged token
    are yielded as a final span.  An empty document yields nothing.
    """
    cdef int i
    cdef int start = -1   # first token of the sentence being built, -1 if none
    for i in range(self.length):
        if start == -1:
            start = i
        if self.data[i].sent_end:
            yield Span(self, start, i + 1)
            start = -1
    # Trailing tokens with no closing sent_end flag.
    if start != -1:
        yield Span(self, start, self.length)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1: cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length: if self.length == self.max_length:
@ -238,21 +256,6 @@ cdef class Tokens:
for i in range(self.length, self.max_length + PADDING): for i in range(self.length, self.max_length + PADDING):
self.data[i].lex = &EMPTY_LEXEME self.data[i].lex = &EMPTY_LEXEME
@property
def sents(self):
    """Yield one Span per sentence, in document order.

    This is really only a place-holder for a proper solution.  Sentence
    boundaries are read from each token's ``sent_end`` flag; tokens after
    the last flagged token are yielded as a final span.  An empty document
    yields nothing.
    """
    cdef int i
    cdef int start = -1   # first token of the sentence being built, -1 if none
    for i in range(self.length):
        if start == -1:
            start = i
        if self.data[i].sent_end:
            yield Span(self, start, i + 1)
            start = -1
    # Trailing tokens with no closing sent_end flag.
    if start != -1:
        yield Span(self, start, self.length)
cdef int set_parse(self, const TokenC* parsed) except -1: cdef int set_parse(self, const TokenC* parsed) except -1:
# TODO: This method is fairly misleading atm. It's used by GreedyParser # TODO: This method is fairly misleading atm. It's used by GreedyParser
# to actually apply the parse calculated. Need to rethink this. # to actually apply the parse calculated. Need to rethink this.
@ -263,6 +266,8 @@ cdef class Tokens:
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
unicode ent_type): unicode ent_type):
"""Merge a multi-word expression into a single token. Currently
experimental; API is likely to change."""
cdef int i cdef int i
cdef int start = -1 cdef int start = -1
cdef int end = -1 cdef int end = -1
@ -526,10 +531,23 @@ cdef class Token:
self.c + self.c.head, self.i + self.c.head, self.array_len, self.c + self.c.head, self.i + self.c.head, self.array_len,
self._seq) self._seq)
property ent_type:
    """Entity-type ID stored on the underlying token struct."""
    def __get__(self):
        type_id = self.c.ent_type
        return type_id
property ent_iob:
    """Entity IOB code stored on the underlying token struct."""
    def __get__(self):
        iob_code = self.c.ent_iob
        return iob_code
property ent_type_:
    """Entity type looked up by ID in the vocab's string table."""
    def __get__(self):
        string_table = self.vocab.strings
        return string_table[self.c.ent_type]
property ent_iob_:
    """Entity IOB code rendered as a string: '', 'I', 'O' or 'B'."""
    def __get__(self):
        # Codes 0..3 index straight into the tuple; 0 maps to the empty string.
        return ('', 'I', 'O', 'B')[self.c.ent_iob]
property whitespace_: property whitespace_:
def __get__(self): def __get__(self):
return self.string[self.c.lex.length:] return self.string[self.c.lex.length:]