# spaCy/tests/test_parse_navigate.py
#
# 72 lines
# 2.1 KiB
# Python

from __future__ import unicode_literals
from os import path
import codecs
from spacy.en import English
import pytest
@pytest.fixture
def sun_text():
    """Return the contents of ``sun.txt``, read as UTF-8 text.

    The file is looked up in the same directory as this test module.
    """
    sun_path = path.join(path.dirname(__file__), 'sun.txt')
    with codecs.open(sun_path, 'r', 'utf8') as handle:
        return handle.read()
@pytest.fixture(scope="module")
def nlp():
    """Return an ``English`` pipeline instance shared by this module's tests.

    The fixture is module-scoped so the pipeline is constructed once per
    test module instead of once per test function. The tests in this file
    only call the pipeline on text, so sharing one instance between them
    is presumably safe and avoids repeated (likely costly) construction.
    """
    return English()
def test_consistency(nlp, sun_text):
    """Check that every left/right child reports its parent as its head.

    For each token in the parsed text, every token yielded by ``lefts``
    and ``rights`` must have that token as its ``head``.
    """
    tokens = nlp(sun_text)
    for head in tokens:
        # Compare with ``==`` rather than ``is``: whether attribute access
        # hands back the very same Python wrapper object is an
        # implementation detail of the library, and ``test_edges`` in this
        # file already compares tokens by equality.
        for child in head.lefts:
            assert child.head == head
        for child in head.rights:
            assert child.head == head
def test_child_consistency(nlp, sun_text):
    """Cross-check ``lefts``/``rights`` against each token's ``head``.

    First the child indices reported by every token's ``lefts`` and
    ``rights`` are collected and compared with ``n_lefts``/``n_rights``.
    Then each token is ticked off from its head's collected set on the
    matching side. At the end every set must be empty, i.e. no head
    listed a child that does not point back to it.
    """
    doc = nlp(sun_text)
    left_children = {}
    right_children = {}

    # Collect child indices per head; the membership asserts guard against
    # the same token index showing up twice.
    for parent in doc:
        assert parent.i not in left_children
        left_children[parent.i] = {kid.i for kid in parent.lefts}
        assert parent.i not in right_children
        right_children[parent.i] = {kid.i for kid in parent.rights}

    # The reported child counts must agree with what iteration produced.
    for parent in doc:
        assert parent.n_rights == len(right_children[parent.i])
        assert parent.n_lefts == len(left_children[parent.i])

    # Tick each token off from its head's set on the correct side. Tokens
    # whose head is themselves match neither branch and are skipped.
    for tok in doc:
        head_idx = tok.head.i
        if tok.i < head_idx:
            assert tok.i in left_children[head_idx]
            assert tok.i not in right_children[head_idx]
            left_children[head_idx].remove(tok.i)
        elif tok.i > head_idx:
            assert tok.i in right_children[head_idx]
            assert tok.i not in left_children[head_idx]
            right_children[head_idx].remove(tok.i)

    # Anything left over is a child that was listed by a head but never
    # claimed that head.
    for leftover in left_children.values():
        assert not leftover
    for leftover in right_children.values():
        assert not leftover
def test_edges(nlp):
    """Check ``left_edge``/``right_edge`` against the ends of ``subtree``.

    For every token of a fixed sentence, ``left_edge`` must equal the
    first token of its ``subtree`` and ``right_edge`` the last one. The
    assertion messages are tab-separated surface forms to aid debugging.
    """
    sentence = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
    doc = nlp(sentence)
    for token in doc:
        span = list(token.subtree)

        first = span[0]
        left_msg = '\t'.join([token.orth_, token.left_edge.orth_, first.orth_])
        assert token.left_edge == first, left_msg

        last = span[-1]
        right_msg = '\t'.join([
            token.orth_,
            token.right_edge.orth_,
            last.orth_,
            token.right_edge.head.orth_,
        ])
        assert token.right_edge == last, right_msg