From 4a16dbfeca6b1067041c15a3591986d7d61fac93 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 16 Jan 2016 17:41:25 +0100
Subject: [PATCH] * Add test for Issue #203: noun chunks should be flat, but sometimes are nested

---
 spacy/tests/tokens/test_noun_chunks.py | 40 ++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 spacy/tests/tokens/test_noun_chunks.py

diff --git a/spacy/tests/tokens/test_noun_chunks.py b/spacy/tests/tokens/test_noun_chunks.py
new file mode 100644
index 000000000..9f1111e8b
--- /dev/null
+++ b/spacy/tests/tokens/test_noun_chunks.py
@@ -0,0 +1,40 @@
+import numpy as np
+
+from spacy.attrs import HEAD, DEP
+from spacy.symbols import nsubj, dobj, punct, amod, nmod, conj, cc, root
+from spacy.en import English
+
+
+def test_not_nested():
+    """Regression test for Issue #203: noun chunks must be flat, not nested.
+
+    A hand-written dependency annotation is loaded into the document, and
+    every word text found in the noun chunks must then occur exactly once.
+    """
+    # The parser is switched off; HEAD/DEP values are supplied by hand below.
+    nlp = English(parser=False)
+    doc = nlp(u'''Peter has chronic command and control issues'''.strip())
+    # One [HEAD, DEP] row per token. NOTE(review): the HEAD values look like
+    # offsets relative to the token's own position -- confirm.
+    annotation = np.asarray(
+        [
+            [1, nsubj],
+            [0, root],
+            [4, amod],
+            [3, nmod],
+            [-1, cc],
+            [-2, conj],
+            [-5, dobj],
+        ],
+        dtype='int32',
+    )
+    doc.from_array([HEAD, DEP], annotation)
+
+    # Tally how often each word text shows up across all noun chunks.
+    occurrences = {}
+    for chunk_span in doc.noun_chunks:
+        for token in chunk_span:
+            occurrences[token.text] = occurrences.get(token.text, 0) + 1
+
+    for text, count in occurrences.items():
+        assert count == 1, (text, [span.text for span in doc.noun_chunks])