# coding: utf-8
from __future__ import unicode_literals

from ..util import get_doc

import pytest


def _noun_chunk_texts(tokenizer, text, heads, tags, deps):
    """Collect the ``text_with_ws`` of each noun chunk of an annotated doc.

    ``text`` is split with ``tokenizer``; the resulting token texts are
    handed to ``get_doc`` together with the supplied ``tags``, ``deps`` and
    ``heads``, and the noun chunks of the returned doc are read in order.

    NOTE(review): ``heads`` looks like per-token offsets relative to the
    token's own position (the ROOT token carries 0) -- confirm against
    ``get_doc``.
    """
    tokens = tokenizer(text)
    words = [token.text for token in tokens]
    doc = get_doc(tokens.vocab, words, tags=tags, deps=deps, heads=heads)
    return [chunk.text_with_ws for chunk in doc.noun_chunks]


def test_parser_noun_chunks_standard(en_tokenizer):
    """English: the passive subject is the only chunk."""
    found = _noun_chunk_texts(
        en_tokenizer,
        "A base phrase should be recognized.",
        heads=[2, 1, 3, 2, 1, 0, -1],
        tags=['DT', 'JJ', 'NN', 'MD', 'VB', 'VBN', '.'],
        deps=['det', 'amod', 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'punct'])
    # One list comparison checks both the chunk count and each chunk's text.
    assert found == ["A base phrase "]


def test_parser_noun_chunks_coordinated(en_tokenizer):
    """English: both conjuncts of a coordinated subject are chunks."""
    found = _noun_chunk_texts(
        en_tokenizer,
        "A base phrase and a good phrase are often the same.",
        heads=[2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4],
        tags=['DT', 'NN', 'NN', 'CC', 'DT', 'JJ', 'NN', 'VBP', 'RB', 'DT',
              'JJ', '.'],
        deps=['det', 'compound', 'nsubj', 'cc', 'det', 'amod', 'conj', 'ROOT',
              'advmod', 'det', 'attr', 'punct'])
    assert found == ["A base phrase ", "a good phrase "]


def test_parser_noun_chunks_pp_chunks(en_tokenizer):
    """English: the subject and the prepositional object are separate chunks."""
    found = _noun_chunk_texts(
        en_tokenizer,
        "A phrase with another phrase occurs.",
        heads=[1, 4, -1, 1, -2, 0, -1],
        tags=['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', '.'],
        deps=['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT', 'punct'])
    assert found == ["A phrase ", "another phrase "]


def test_parser_noun_chunks_standard_de(de_tokenizer):
    """German: the subject and the noun phrase after the preposition are chunks."""
    found = _noun_chunk_texts(
        de_tokenizer,
        "Eine Tasse steht auf dem Tisch.",
        heads=[1, 1, 0, -1, 1, -2, -4],
        tags=['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.'],
        deps=['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct'])
    assert found == ["Eine Tasse ", "dem Tisch "]


def test_de_extended_chunk(de_tokenizer):
    """German: a chunk may extend over a following noun ("Tasse Kaffee")."""
    found = _noun_chunk_texts(
        de_tokenizer,
        "Die Sängerin singt mit einer Tasse Kaffee Arien.",
        heads=[1, 1, 0, -1, 1, -2, -1, -5, -6],
        tags=['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.'],
        deps=['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct'])
    assert found == ["Die Sängerin ", "einer Tasse Kaffee ", "Arien "]