* Add noun_chunks iterator, and fix left/right child setting in Doc.merge

This commit is contained in:
Matthew Honnibal 2015-07-30 02:29:49 +02:00
parent d153f18969
commit 74d8cb3980
1 changed files with 49 additions and 40 deletions

View File

@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr
from .spans import Span
from .spans cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray
@ -154,6 +154,18 @@ cdef class Doc:
if start != -1:
yield Span(self, start, self.length, label=label)
@property
def noun_chunks(self):
"""Yield spans for base noun phrases."""
cdef const TokenC* word
labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj']
np_deps = [self.vocab.strings[label] for label in labels]
np_label = self.vocab.strings['NP']
for i in range(self.length):
word = &self.data[i]
if word.pos == NOUN and word.dep in np_deps:
yield Span(self, word.l_edge, i+1, label=np_label)
@property
def sents(self):
"""
@ -297,20 +309,7 @@ cdef class Doc:
elif attr_id == ENT_TYPE:
for i in range(length):
tokens[i].ent_type = values[i]
cdef TokenC* head
cdef TokenC* child
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge
set_children_from_heads(self.data, self.length)
return self
def to_bytes(self):
@ -354,9 +353,12 @@ cdef class Doc:
break
else:
return None
cdef unicode string = self.string
cdef Span span = self[start:end]
# Get LexemeC for newly merged token
new_orth = string[start_idx:end_idx]
new_orth = ''.join([t.string for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-1]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &self.data[start]
@ -372,30 +374,16 @@ cdef class Doc:
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
# Fix dependencies
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
span_root = span.root.i
for i in range(self.length):
self.data[i].head += i
# Find the head of the merged token, and its dep relation
outer_heads = {}
for i in range(start, end):
head_idx = self.data[i].head
if head_idx == i or head_idx < start or head_idx >= end:
# Don't consider "heads" which are actually dominated by a word
# in the region we're merging
gp = head_idx
while self.data[gp].head != gp:
if start <= gp < end:
break
gp = self.data[gp].head
else:
# If we have multiple words attaching to the same head,
# but with different dep labels, we're preferring the last
# occurring dep label. Shrug. What else could we do, I guess?
outer_heads[head_idx] = self.data[i].dep
token.head, token.dep = max(outer_heads.items())
# Set the head of the merged token, and its dep relation, from the Span
token.head = self.data[span_root].head
token.dep = span.root.dep
# Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end
@ -406,7 +394,6 @@ cdef class Doc:
self.data[i].head = start
elif head_idx >= end:
self.data[i].head -= offset
# TODO: Fix left and right deps
# Now compress the token array
for i in range(end, self.length):
self.data[i - offset] = self.data[i]
@ -417,6 +404,28 @@ cdef class Doc:
for i in range(self.length):
# ...And, set heads back to a relative position
self.data[i].head -= i
# Set the left/right children, left/right edges
set_children_from_heads(self.data, self.length)
# Clear the cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object
return self[start]
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head
cdef TokenC* child
cdef int i
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge