* Add test for Issue #203: noun chunks should be flat, but sometimes are nested

This commit is contained in:
Matthew Honnibal 2016-01-16 17:41:25 +01:00
parent 995b2d18fd
commit 4a16dbfeca
1 changed file with 31 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
import numpy as np
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
from spacy.en import English
def test_not_nested():
    """Check that no word is covered by more than one noun chunk.

    A dependency parse is written onto the document by hand, then every
    word of every noun chunk is tallied by its text. A tally above one
    means two chunks overlap, i.e. one chunk is nested inside another.
    """
    # The pipeline is built with parser=False; HEAD and DEP are supplied
    # manually through from_array below.
    pipeline = English(parser=False)
    text = u'''Peter has chronic command and control issues'''.strip()
    doc = pipeline(text)

    # One [HEAD, DEP] row per token, in sentence order. The trailing word
    # labels assume one token per whitespace-separated word.
    # NOTE(review): HEAD values look like offsets relative to the token's
    # own position (the root row carries 0) -- confirm against from_array.
    parse_rows = [
        [1, nsubj],    # Peter
        [0, root],     # has
        [4, amod],     # chronic
        [3, nmod],     # command
        [-1, cc],      # and
        [-2, conj],    # control
        [-5, dobj],    # issues
    ]
    annotations = np.asarray(parse_rows, dtype='int32')
    doc.from_array([HEAD, DEP], annotations)

    # Tally how often each word text shows up across all noun chunks.
    seen_counts = {}
    for chunk in doc.noun_chunks:
        for token in chunk:
            seen_counts[token.text] = seen_counts.get(token.text, 0) + 1

    for text_form, count in seen_counts.items():
        # On failure, report the offending word along with every chunk's
        # text so the overlap is visible in the assertion message.
        assert count == 1, (text_form, [span.text for span in doc.noun_chunks])