spaCy/spacy/tests/unit/test_parser.py

# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import pytest
import numpy

from spacy.attrs import HEAD, DEP


@pytest.mark.models
class TestNounChunks:
    """Noun chunk tests over hand-annotated English and German sentences.

    Each fixture tokenizes a fixed, space-separated sentence, assigns the
    part-of-speech tags by hand and then loads a dependency parse with
    ``from_array([HEAD, DEP], ...)``.  Every ``[head, dep]`` row describes
    one token: ``head`` is the offset from the token to its head (``0`` on
    the root row) and ``dep`` is the id of the dependency label looked up in
    the pipeline's string store.  The trailing comment on each row names the
    arc that the offset encodes.

    The tests compare ``chunk.string``, so the expected values carry the
    trailing space that follows the last token of the chunk.
    """

    @pytest.fixture(scope="class")
    def ex1_en(self, EN):
        """Return 'A base phrase should be recognized .' with a passive parse."""
        example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))
        det, compound, nsubjpass, aux, auxpass, root, punct = (
            EN.vocab.strings[label]
            for label in ['det', 'compound', 'nsubjpass', 'aux', 'auxpass', 'root', 'punct'])
        example.from_array([HEAD, DEP], numpy.asarray(
            [
                [2, det],          # A -> phrase
                [1, compound],     # base -> phrase
                [3, nsubjpass],    # phrase -> recognized
                [2, aux],          # should -> recognized
                [1, auxpass],      # be -> recognized
                [0, root],         # recognized
                [-1, punct]        # . -> recognized
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex2_en(self, EN):
        """Return 'A base phrase and a good phrase are often the same .' with
        the two subject phrases joined by a ``conj`` arc."""
        example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))
        det, compound, nsubj, cc, amod, conj, root, advmod, attr, punct = (
            EN.vocab.strings[label]
            for label in ['det', 'compound', 'nsubj', 'cc', 'amod', 'conj', 'root', 'advmod', 'attr', 'punct'])
        example.from_array([HEAD, DEP], numpy.asarray(
            [
                [2, det],          # A -> phrase
                [1, compound],     # base -> phrase
                [5, nsubj],        # phrase -> are
                [-1, cc],          # and -> phrase (first)
                [2, det],          # a -> phrase (second); the determiner heads to the noun, not to 'good'
                [1, amod],         # good -> phrase (second)
                [-4, conj],        # phrase (second) -> phrase (first)
                [0, root],         # are
                [-1, advmod],      # often -> are
                [1, det],          # the -> same
                [-3, attr],        # same -> are
                [-4, punct]        # . -> are
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex3_en(self, EN):
        """Return 'A phrase with another phrase occurs .' where the second
        noun is the object of a preposition hanging off the subject."""
        example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))
        EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))
        det, nsubj, prep, pobj, root, punct = (
            EN.vocab.strings[label]
            for label in ['det', 'nsubj', 'prep', 'pobj', 'root', 'punct'])
        example.from_array([HEAD, DEP], numpy.asarray(
            [
                [1, det],          # A -> phrase (first)
                [4, nsubj],        # phrase (first) -> occurs
                [-1, prep],        # with -> phrase (first)
                [1, det],          # another -> phrase (second)
                [-2, pobj],        # phrase (second) -> with
                [0, root],         # occurs
                [-1, punct]        # . -> occurs
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex1_de(self, DE):
        """Return 'Eine Tasse steht auf dem Tisch .' with a subject and a
        prepositional modifier."""
        example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))
        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))
        nk, sb, root, mo, punct = (
            DE.vocab.strings[label]
            for label in ['nk', 'sb', 'root', 'mo', 'punct'])
        example.from_array([HEAD, DEP], numpy.asarray(
            [
                [1, nk],           # Eine -> Tasse
                [1, sb],           # Tasse -> steht
                [0, root],         # steht
                [-1, mo],          # auf -> steht
                [1, nk],           # dem -> Tisch
                [-2, nk],          # Tisch -> auf
                [-4, punct]        # . -> steht (root), as in the other fixtures
            ], dtype='int32'))
        return example

    @pytest.fixture(scope="class")
    def ex2_de(self, DE):
        """Return 'Die Sängerin singt mit einer Tasse Kaffee Arien .' where
        'Kaffee' hangs directly off the noun 'Tasse'."""
        example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))
        DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))
        nk, sb, root, mo, punct, oa = (
            DE.vocab.strings[label]
            for label in ['nk', 'sb', 'root', 'mo', 'punct', 'oa'])
        example.from_array([HEAD, DEP], numpy.asarray(
            [
                [1, nk],           # Die -> Sängerin
                [1, sb],           # Sängerin -> singt
                [0, root],         # singt
                [-1, mo],          # mit -> singt
                [1, nk],           # einer -> Tasse
                [-2, nk],          # Tasse -> mit
                [-1, nk],          # Kaffee -> Tasse
                [-5, oa],          # Arien -> singt
                [-6, punct]        # . -> singt
            ], dtype='int32'))
        return example

    def test_en_standard_chunk(self, ex1_en):
        """The passive subject is the only chunk in the sentence."""
        chunks = list(ex1_en.noun_chunks)
        assert len(chunks) == 1
        assert chunks[0].string == 'A base phrase '

    def test_en_coordinated_chunks(self, ex2_en):
        """Both conjoined subject phrases come back as separate chunks."""
        chunks = list(ex2_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A base phrase '
        assert chunks[1].string == 'a good phrase '

    def test_en_pp_chunks(self, ex3_en):
        """The subject and the prepositional object are separate chunks."""
        chunks = list(ex3_en.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'A phrase '
        assert chunks[1].string == 'another phrase '

    def test_de_standard_chunk(self, ex1_de):
        """The subject and the noun inside the prepositional phrase are chunks."""
        chunks = list(ex1_de.noun_chunks)
        assert len(chunks) == 2
        assert chunks[0].string == 'Eine Tasse '
        assert chunks[1].string == 'dem Tisch '

    def test_de_extended_chunk(self, ex2_de):
        """'Tasse Kaffee' stays one chunk and no token is returned twice."""
        chunks = list(ex2_de.noun_chunks)
        assert len(chunks) == 3
        assert chunks[0].string == 'Die Sängerin '
        assert chunks[1].string == 'einer Tasse Kaffee '
        assert chunks[2].string == 'Arien '
German noun chunk iterator now doesn't return tokens more than once 2016-05-03 14:58:59 +00:00			`# -*- coding: utf-8 -*-`

reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`from __future__ import unicode_literals`

			`import pytest`
			`import numpy`

			`from spacy.attrs import HEAD, DEP`


			`@pytest.mark.models`
			`class TestNounChunks:`
			`@pytest.fixture(scope="class")`
			`def ex1_en(self, EN):`
			`example = EN.tokenizer.tokens_from_list('A base phrase should be recognized .'.split(' '))`
			`EN.tagger.tag_from_strings(example, 'DT NN NN MD VB VBN .'.split(' '))`
German noun chunk iterator now doesn't return tokens more than once 2016-05-03 14:58:59 +00:00			`det,compound,nsubjpass,aux,auxpass,root,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubjpass','aux','auxpass','root','punct'] )`
reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`example.from_array([HEAD, DEP],`
			`numpy.asarray(`
			`[`
			`[2, det],`
			`[1, compound],`
			`[3, nsubjpass],`
			`[2, aux],`
			`[1, auxpass],`
			`[0, root],`
			`[-1, punct]`
			`], dtype='int32'))`
			`return example`

			`@pytest.fixture(scope="class")`
			`def ex2_en(self, EN):`
			`example = EN.tokenizer.tokens_from_list('A base phrase and a good phrase are often the same .'.split(' '))`
			`EN.tagger.tag_from_strings(example, 'DT NN NN CC DT JJ NN VBP RB DT JJ .'.split(' '))`
German noun chunk iterator now doesn't return tokens more than once 2016-05-03 14:58:59 +00:00			`det,compound,nsubj,cc,amod,conj,root,advmod,attr,punct = tuple( EN.vocab.strings[l] for l in ['det','compound','nsubj','cc','amod','conj','root','advmod','attr','punct'] )`
reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`example.from_array([HEAD, DEP],`
			`numpy.asarray(`
			`[`
			`[2, det],`
			`[1, compound],`
			`[5, nsubj],`
			`[-1, cc],`
			`[1, det],`
			`[1, amod],`
			`[-4, conj],`
			`[0, root],`
			`[-1, advmod],`
			`[1, det],`
			`[-3, attr],`
			`[-4, punct]`
			`], dtype='int32'))`
			`return example`

			`@pytest.fixture(scope="class")`
			`def ex3_en(self, EN):`
			`example = EN.tokenizer.tokens_from_list('A phrase with another phrase occurs .'.split(' '))`
			`EN.tagger.tag_from_strings(example, 'DT NN IN DT NN VBZ .'.split(' '))`
German noun chunk iterator now doesn't return tokens more than once 2016-05-03 14:58:59 +00:00			`det,nsubj,prep,pobj,root,punct = tuple( EN.vocab.strings[l] for l in ['det','nsubj','prep','pobj','root','punct'] )`
reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`example.from_array([HEAD, DEP],`
			`numpy.asarray(`
			`[`
			`[1, det],`
			`[4, nsubj],`
			`[-1, prep],`
			`[1, det],`
			`[-2, pobj],`
			`[0, root],`
			`[-1, punct]`
			`], dtype='int32'))`
			`return example`

German noun chunk iterator now doesn't return tokens more than once 2016-05-03 14:58:59 +00:00			`@pytest.fixture(scope="class")`
			`def ex1_de(self, DE):`
			`example = DE.tokenizer.tokens_from_list('Eine Tasse steht auf dem Tisch .'.split(' '))`
			`DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN $.'.split(' '))`
			`nk,sb,root,mo,punct = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct'])`
			`example.from_array([HEAD, DEP],`
			`numpy.asarray(`
			`[`
			`[1, nk],`
			`[1, sb],`
			`[0, root],`
			`[-1, mo],`
			`[1, nk],`
			`[-2, nk],`
			`[-3, punct]`
			`], dtype='int32'))`
			`return example`

			`@pytest.fixture(scope="class")`
			`def ex2_de(self, DE):`
			`example = DE.tokenizer.tokens_from_list('Die Sängerin singt mit einer Tasse Kaffee Arien .'.split(' '))`
			`DE.tagger.tag_from_strings(example, 'ART NN VVFIN APPR ART NN NN NN $.'.split(' '))`
			`nk,sb,root,mo,punct,oa = tuple( DE.vocab.strings[l] for l in ['nk','sb','root','mo','punct','oa'])`
			`example.from_array([HEAD, DEP],`
			`numpy.asarray(`
			`[`
			`[1, nk],`
			`[1, sb],`
			`[0, root],`
			`[-1, mo],`
			`[1, nk],`
			`[-2, nk],`
			`[-1, nk],`
			`[-5, oa],`
			`[-6, punct]`
			`], dtype='int32'))`
			`return example`
add tests for German noun chunker 2016-05-03 13:01:28 +00:00
			`def test_en_standard_chunk(self, ex1_en):`
reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`chunks = list(ex1_en.noun_chunks)`
			`assert len(chunks) == 1`
			`assert chunks[0].string == 'A base phrase '`

add tests for German noun chunker 2016-05-03 13:01:28 +00:00			`def test_en_coordinated_chunks(self, ex2_en):`
reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`chunks = list(ex2_en.noun_chunks)`
			`assert len(chunks) == 2`
			`assert chunks[0].string == 'A base phrase '`
			`assert chunks[1].string == 'a good phrase '`

add tests for German noun chunker 2016-05-03 13:01:28 +00:00			`def test_en_pp_chunks(self, ex3_en):`
reformulate noun chunk tests for English 2016-05-03 12:24:35 +00:00			`chunks = list(ex3_en.noun_chunks)`
			`assert len(chunks) == 2`
			`assert chunks[0].string == 'A phrase '`
			`assert chunks[1].string == 'another phrase '`
German noun chunk iterator now doesn't return tokens more than once 2016-05-03 14:58:59 +00:00
			`def test_de_standard_chunk(self, ex1_de):`
			`chunks = list(ex1_de.noun_chunks)`
			`assert len(chunks) == 2`
			`assert chunks[0].string == 'Eine Tasse '`
			`assert chunks[1].string == 'dem Tisch '`

			`def test_de_extended_chunk(self, ex2_de):`
			`chunks = list(ex2_de.noun_chunks)`
			`assert len(chunks) == 3`
			`assert chunks[0].string == 'Die Sängerin '`
			`assert chunks[1].string == 'einer Tasse Kaffee '`
			`assert chunks[2].string == 'Arien '`