Modernise noun chunks tests and don't depend on models

Ines Montani 2017-01-13 02:01:00 +01:00
parent 3ee97b5686
commit 49febd8c62
1 changed file with 61 additions and 122 deletions


@@ -1,137 +1,76 @@
 # coding: utf-8
 from __future__ import unicode_literals

+from ..util import get_doc
+
 import pytest
-import numpy
-
-from ...attrs import HEAD, DEP
-
-
-@pytest.mark.models
-class TestNounChunks:
-    @pytest.fixture(scope="class")
-    def ex1_en(self, EN):
-        example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
-        EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
-        det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )
-        example.from_array([HEAD, DEP],
-            numpy.asarray(
-            [
-                [2, det],
-                [1, compound],
-                [3, nsubjpass],
-                [2, aux],
-                [1, auxpass],
-                [0, root],
-                [-1, punct]
-            ], dtype='int32'))
-        return example
-
-    @pytest.fixture(scope="class")
-    def ex2_en(self, EN):
-        example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
-        EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
-        det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )
-        example.from_array([HEAD, DEP],
-            numpy.asarray(
-            [
-                [2, det],
-                [1, compound],
-                [5, nsubj],
-                [-1, cc],
-                [1, det],
-                [1, amod],
-                [-4, conj],
-                [0, root],
-                [-1, advmod],
-                [1, det],
-                [-3, attr],
-                [-4, punct]
-            ], dtype='int32'))
-        return example
-
-    @pytest.fixture(scope="class")
-    def ex3_en(self, EN):
-        example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
-        EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
-        det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )
-        example.from_array([HEAD, DEP],
-            numpy.asarray(
-            [
-                [1, det],
-                [4, nsubj],
-                [-1, prep],
-                [1, det],
-                [-2, pobj],
-                [0, root],
-                [-1, punct]
-            ], dtype='int32'))
-        return example
-
-    @pytest.fixture(scope="class")
-    def ex1_de(self, DE):
-        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
-        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
-        nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'] )
-        example.from_array([HEAD, DEP],
-            numpy.asarray(
-            [
-                [1, nk],
-                [1, sb],
-                [0, root],
-                [-1, mo],
-                [1, nk],
-                [-2, nk],
-                [-3, punct]
-            ], dtype='int32'))
-        return example
-
-    @pytest.fixture(scope="class")
-    def ex2_de(self, DE):
-        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
-        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
-        nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'] )
-        example.from_array([HEAD, DEP],
-            numpy.asarray(
-            [
-                [1, nk],
-                [1, sb],
-                [0, root],
-                [-1, mo],
-                [1, nk],
-                [-2, nk],
-                [-1, nk],
-                [-5, oa],
-                [-6, punct]
-            ], dtype='int32'))
-        return example
-
-    def test_en_standard_chunk(self, ex1_en):
-        chunks = list(ex1_en.noun_chunks)
-        assert len(chunks) == 1
-        assert chunks[0].string == 'A base phrase '
-
-    def test_en_coordinated_chunks(self, ex2_en):
-        chunks = list(ex2_en.noun_chunks)
-        assert len(chunks) == 2
-        assert chunks[0].string == 'A base phrase '
-        assert chunks[1].string == 'a good phrase '
-
-    def test_en_pp_chunks(self, ex3_en):
-        chunks = list(ex3_en.noun_chunks)
-        assert len(chunks) == 2
-        assert chunks[0].string == 'A phrase '
-        assert chunks[1].string == 'another phrase '
-
-    def test_de_standard_chunk(self, ex1_de):
-        chunks = list(ex1_de.noun_chunks)
-        assert len(chunks) == 2
-        assert chunks[0].string == 'Eine Tasse '
-        assert chunks[1].string == 'dem Tisch '
-
-    def test_de_extended_chunk(self, ex2_de):
-        chunks = list(ex2_de.noun_chunks)
-        assert len(chunks) == 3
-        assert chunks[0].string == 'Die Sängerin '
-        assert chunks[1].string == 'einer Tasse Kaffee '
-        assert chunks[2].string == 'Arien '
+
+
+def test_parser_noun_chunks_standard(en_tokenizer):
+    text = "A base phrase should be recognized."
+    heads = [2, 1, 3, 2, 1, 0, -1]
+    tags = ['DT', 'JJ', 'NN', 'MD', 'VB', 'VBN', '.']
+    deps = ['det', 'amod', 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'punct']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 1
+    assert chunks[0].text_with_ws == "A base phrase "
+
+
+def test_parser_noun_chunks_coordinated(en_tokenizer):
+    text = "A base phrase and a good phrase are often the same."
+    heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
+    tags = ['DT', 'NN', 'NN', 'CC', 'DT', 'JJ', 'NN', 'VBP', 'RB', 'DT', 'JJ', '.']
+    deps = ['det', 'compound', 'nsubj', 'cc', 'det', 'amod', 'conj', 'ROOT', 'advmod', 'det', 'attr', 'punct']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "A base phrase "
+    assert chunks[1].text_with_ws == "a good phrase "
+
+
+def test_parser_noun_chunks_pp_chunks(en_tokenizer):
+    text = "A phrase with another phrase occurs."
+    heads = [1, 4, -1, 1, -2, 0, -1]
+    tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', '.']
+    deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT', 'punct']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "A phrase "
+    assert chunks[1].text_with_ws == "another phrase "
+
+
+def test_parser_noun_chunks_standard_de(de_tokenizer):
+    text = "Eine Tasse steht auf dem Tisch."
+    heads = [1, 1, 0, -1, 1, -2, -4]
+    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
+    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
+
+    tokens = de_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "Eine Tasse "
+    assert chunks[1].text_with_ws == "dem Tisch "
+
+
+def test_de_extended_chunk(de_tokenizer):
+    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
+    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
+    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
+    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
+
+    tokens = de_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 3
+    assert chunks[0].text_with_ws == "Die Sängerin "
+    assert chunks[1].text_with_ws == "einer Tasse Kaffee "
+    assert chunks[2].text_with_ws == "Arien "
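
Note on the approach: the new tests build annotated Doc objects through the get_doc helper (imported from ..util) instead of running the statistical tagger and parser, which is what removes the @pytest.mark.models dependency. The sketch below is a hypothetical, minimal stand-in for such a helper, written against spaCy's public Doc and from_array API exactly as the removed tests used them; the name make_doc and its exact signature are assumptions, and the real helper in the repository's test utilities may differ.

    # Hypothetical minimal stand-in for the get_doc test helper (assumption:
    # the actual helper in spacy/tests/util.py may look different).
    import numpy

    from spacy.attrs import HEAD, DEP
    from spacy.tokens import Doc


    def make_doc(vocab, words, tags, deps, heads):
        # Build a Doc directly from pre-tokenized words -- no tagger or parser runs.
        doc = Doc(vocab, words=words)
        # HEAD takes relative offsets (as in the heads lists above); DEP takes
        # string-store IDs, so look the label strings up in the vocab.
        values = numpy.asarray(
            [[head, vocab.strings[dep]] for head, dep in zip(heads, deps)],
            dtype='int32')
        doc.from_array([HEAD, DEP], values)
        # Part-of-speech tags can be written straight onto the tokens.
        for token, tag in zip(doc, tags):
            token.tag_ = tag
        return doc

With a helper along these lines, doc.noun_chunks can be exercised against hand-written heads, deps and tags, which is the pattern every test above follows.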