spaCy/spacy/nonproj.py

56 lines
1.8 KiB
Python
Raw Normal View History

def ancestors(word, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
head = word
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head == None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following
# the head relation upwards always ends at the root node
for word in range(len(heads)):
seen = set([word])
for ancestor in ancestors(word,heads):
if ancestor in seen:
return seen
seen.add(ancestor)
return None
def is_non_projective_arc(word, heads):
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
# if there is a word k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[word]
if head == word: # root arcs cannot be non-projective
return False
elif head == None: # unattached tokens cannot be non-projective
return False
start, end = (head+1, word) if head < word else (word+1, head)
for k in range(start,end):
for ancestor in ancestors(k,heads):
if ancestor == None: # for unattached tokens/subtrees
break
elif ancestor == head: # normal case: k dominated by h
break
else: # head not in ancestors: d -> h is non-projective
return True
return False
def is_non_projective_tree(heads):
# a tree is non-projective if at least one arc is non-projective
return any( is_non_projective_arc(word,heads) for word in range(len(heads)) )