* Add test for Issue #203: noun chunks should be flat, but sometimes are nested

2016-01-16 17:41:25 +01:00 · 2016-01-16 17:41:25 +01:00 · 4a16dbfeca
parent 995b2d18fd
commit 4a16dbfeca
1 changed file with 31 additions and 0 deletions
--- a/spacy/tests/tokens/test_noun_chunks.py
+++ b/spacy/tests/tokens/test_noun_chunks.py
@@ -0,0 +1,31 @@
 import numpy as np
 from spacy.attrs import HEAD, DEP
 from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
 from spacy.en import English
 def test_not_nested():
    """Check that no word is covered by more than one noun chunk.

    Regression test for Issue #203: noun chunks should be flat, never
    nested.  A hand-written dependency annotation is loaded into the
    document, every word yielded by ``noun_chunks`` is tallied by its
    text, and each text must have been seen exactly once.
    """
    # One [HEAD, DEP] row per word of the sentence, in order.
    # NOTE(review): the HEAD values look like offsets relative to the
    # token's own position (e.g. "issues" -5 -> "has") -- confirm.
    annotation = [
        [1, nsubj],     # Peter
        [0, root],      # has
        [4, amod],      # chronic
        [3, nmod],      # command
        [-1, cc],       # and
        [-2, conj],     # control
        [-5, dobj],     # issues
    ]

    # The parser is switched off; the parse is supplied by hand below.
    nlp = English(parser=False)
    doc = nlp(u'Peter has chronic command and control issues')
    doc.from_array([HEAD, DEP], np.asarray(annotation, dtype='int32'))

    # Tally how many times each word text shows up across all chunks.
    seen = {}
    for chunk in doc.noun_chunks:
        for word in chunk:
            seen[word.text] = seen.get(word.text, 0) + 1

    # A count above one means two chunks overlap; report the offending
    # word together with every chunk text to make the failure readable.
    for text in seen:
        count = seen[text]
        assert count == 1, (text, [chunk.text for chunk in doc.noun_chunks])