mirror of https://github.com/explosion/spaCy.git
56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
|
|
||
|
|
||
|
def ancestors(word, heads):
|
||
|
# returns all words going from the word up the path to the root
|
||
|
# the path to root cannot be longer than the number of words in the sentence
|
||
|
# this function ends after at most len(heads) steps
|
||
|
# because it would otherwise loop indefinitely on cycles
|
||
|
head = word
|
||
|
cnt = 0
|
||
|
while heads[head] != head and cnt < len(heads):
|
||
|
head = heads[head]
|
||
|
cnt += 1
|
||
|
yield head
|
||
|
if head == None:
|
||
|
break
|
||
|
|
||
|
|
||
|
def contains_cycle(heads):
|
||
|
# in an acyclic tree, the path from each word following
|
||
|
# the head relation upwards always ends at the root node
|
||
|
for word in range(len(heads)):
|
||
|
seen = set([word])
|
||
|
for ancestor in ancestors(word,heads):
|
||
|
if ancestor in seen:
|
||
|
return seen
|
||
|
seen.add(ancestor)
|
||
|
return None
|
||
|
|
||
|
|
||
|
def is_non_projective_arc(word, heads):
|
||
|
# definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
|
||
|
# if there is a word k, h < k < d such that h is not
|
||
|
# an ancestor of k. Same for h -> d, h > d
|
||
|
head = heads[word]
|
||
|
if head == word: # root arcs cannot be non-projective
|
||
|
return False
|
||
|
elif head == None: # unattached tokens cannot be non-projective
|
||
|
return False
|
||
|
|
||
|
start, end = (head+1, word) if head < word else (word+1, head)
|
||
|
for k in range(start,end):
|
||
|
for ancestor in ancestors(k,heads):
|
||
|
if ancestor == None: # for unattached tokens/subtrees
|
||
|
break
|
||
|
elif ancestor == head: # normal case: k dominated by h
|
||
|
break
|
||
|
else: # head not in ancestors: d -> h is non-projective
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
|
||
|
def is_non_projective_tree(heads):
|
||
|
# a tree is non-projective if at least one arc is non-projective
|
||
|
return any( is_non_projective_arc(word,heads) for word in range(len(heads)) )
|
||
|
|