* Add test for Issue #203: noun chunks should be flat, but sometimes are nested

2016-01-16 17:41:25 +01:00 · 2016-01-16 17:41:25 +01:00 · 4a16dbfeca
parent 995b2d18fd
commit 4a16dbfeca
1 changed files with 31 additions and 0 deletions
--- a/spacy/tests/tokens/test_noun_chunks.py
+++ b/spacy/tests/tokens/test_noun_chunks.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+from spacy.attrs import HEAD, DEP
+from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
+from spacy.en import English
+
+
+
+def test_not_nested():  # Regression test for Issue #203: no word may belong to more than one noun chunk.
+    nlp = English(parser=False)  # parser disabled; the dependency parse is written by hand below
+    sent = u'''Peter has chronic command and control issues'''.strip()
+    tokens = nlp(sent)
+    tokens.from_array(  # overwrite the HEAD and DEP columns with a fixed parse
+        [HEAD, DEP],  # NOTE(review): HEAD values below look like offsets relative to each token (0 on the root row) - confirm
+        np.asarray(
+            [  # one [HEAD, DEP] row per token, assuming the sentence tokenizes into its 7 whitespace-separated words - confirm
+                [1, nsubj],  # Peter
+                [0, root],  # has
+                [4, amod],  # chronic
+                [3, nmod],  # command
+                [-1, cc],  # and
+                [-2, conj],  # control
+                [-5, dobj]  # issues
+            ], dtype='int32'))
+    word_occurred = {}  # word text -> number of noun chunks that contain it
+    for chunk in tokens.noun_chunks:
+        for word in chunk:
+            word_occurred.setdefault(word.text, 0)  # keyed by text, so this relies on the sentence having no repeated word
+            word_occurred[word.text] += 1
+    for word, freq in word_occurred.items():
+        assert freq == 1, (word, [chunk.text for chunk in tokens.noun_chunks])  # on failure, report the duplicated word and all chunk texts