spaCy/spacy/tests/lang/nl/test_noun_chunks.py

import pytest
from spacy.tokens import Doc
from spacy.util import filter_spans


@pytest.fixture
def nl_sample(nl_vocab):
    """Build a manually annotated Dutch Doc spanning four sentences.

    TEXT:
    Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
    Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
    geen avondeten gekocht.

    Every row of the table below describes one token as
    (word, head, dependency label, coarse POS tag). The head is the index of
    the governing token within this same table; ROOT tokens reference their
    own index.
    """
    # fmt: off
    annotated_tokens = [
        # Haar vriend lacht luid.
        ("Haar", 1, "nmod:poss", "PRON"),
        ("vriend", 2, "nsubj", "NOUN"),
        ("lacht", 2, "ROOT", "VERB"),
        ("luid", 2, "advmod", "ADJ"),
        (".", 2, "punct", "PUNCT"),
        # We kregen alweer ruzie toen we de supermarkt ingingen.
        ("We", 6, "nsubj", "PRON"),
        ("kregen", 6, "ROOT", "VERB"),
        ("alweer", 6, "advmod", "ADV"),
        ("ruzie", 6, "obj", "NOUN"),
        ("toen", 13, "mark", "SCONJ"),
        ("we", 13, "nsubj", "PRON"),
        ("de", 12, "det", "DET"),
        ("supermarkt", 13, "obj", "NOUN"),
        ("ingingen", 6, "advcl", "NOUN"),
        (".", 6, "punct", "PUNCT"),
        # Aan het begin van de supermarkt is al het fruit en de groentes.
        ("Aan", 17, "case", "ADP"),
        ("het", 17, "det", "DET"),
        ("begin", 24, "obl", "NOUN"),
        ("van", 20, "case", "ADP"),
        ("de", 20, "det", "DET"),
        ("supermarkt", 17, "nmod", "NOUN"),
        ("is", 24, "cop", "AUX"),
        ("al", 24, "advmod", "ADV"),
        ("het", 24, "det", "DET"),
        ("fruit", 24, "ROOT", "NOUN"),
        ("en", 27, "cc", "CCONJ"),
        ("de", 27, "det", "DET"),
        ("groentes", 24, "conj", "NOUN"),
        (".", 24, "punct", "PUNCT"),
        # Uiteindelijk hebben we dan ook geen avondeten gekocht.
        ("Uiteindelijk", 36, "advmod", "ADJ"),
        ("hebben", 36, "aux", "AUX"),
        ("we", 36, "nsubj", "PRON"),
        ("dan", 36, "advmod", "ADV"),
        ("ook", 36, "advmod", "ADV"),
        ("geen", 35, "det", "DET"),
        ("avondeten", 36, "obj", "NOUN"),
        ("gekocht", 36, "ROOT", "VERB"),
        (".", 36, "punct", "PUNCT"),
    ]
    # fmt: on

    # Split the per-token rows back into the four parallel lists Doc expects.
    words, heads, deps, pos = (list(column) for column in zip(*annotated_tokens))
    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)


@pytest.fixture
def nl_reference_chunking():
    """Return the expected lower-cased noun phrases, in document order.

    The reference NOUN-PHRASES were obtained with frog
    (https://github.com/LanguageMachines/frog/).
    """
    # Grouped by source sentence purely for readability; the fixture value
    # is the flat concatenation of the four groups.
    first_sentence = ["haar vriend"]
    second_sentence = ["we", "ruzie", "we", "de supermarkt"]
    third_sentence = ["het begin", "de supermarkt", "het fruit", "de groentes"]
    fourth_sentence = ["we", "geen avondeten"]
    return first_sentence + second_sentence + third_sentence + fourth_sentence


def test_need_dep(nl_tokenizer):
    """
    Requesting noun_chunks on a Dutch Doc that was only tokenized (no
    dependency parse) must raise ValueError.
    """
    unparsed_doc = nl_tokenizer("Haar vriend lacht luid.")

    # noun_chunks is consumed through list() so the iterator is actually run
    # inside the raises-context.
    with pytest.raises(ValueError):
        list(unparsed_doc.noun_chunks)


def test_chunking(nl_sample, nl_reference_chunking):
    """
    Compare the noun chunks found in the annotated sample Doc with the
    reference chunking.

    The sample simulates a Doc object as would be produced by nl_core_news_md.
    Chunk texts are lower-cased before comparison, and both the content and
    the order of the chunks must match the reference.
    """
    observed = []
    for chunk in nl_sample.noun_chunks:
        observed.append(chunk.text.lower())
    assert observed == nl_reference_chunking


@pytest.mark.issue(10846)
def test_no_overlapping_chunks(nl_vocab):
    """
    Regression test for issue 10846: the noun chunks produced for this
    sentence must come out unchanged when passed through filter_spans.
    """
    # fmt: off
    words = ["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"]
    deps = ["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"]
    heads = [1, 3, 3, 3, 8, 8, 5, 8, 3]
    pos = ["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"]
    # fmt: on
    doc = Doc(nl_vocab, words=words, deps=deps, heads=heads, pos=pos)

    found_chunks = list(doc.noun_chunks)
    # If filter_spans had to drop or reorder anything, the two lists differ.
    assert filter_spans(found_chunks) == found_chunks
Adding noun_chunks to the DUTCH language model (nl) (#8529) * :sparkles: implement noun_chunks for dutch language * copy/paste FR and SV syntax iterators to accomodate UD tags * added tests with dutch text * signed contributor agreement * :bug: fix noun chunks generator * built from scratch * define noun chunk as a single Noun-Phrase * includes some corner cases debugging (incorrect POS tagging) * test with provided annotated sample (POS, DEP) * :white_check_mark: fix failing test * CI pipeline did not like the added sample file * add the sample as a pytest fixture * Update spacy/lang/nl/syntax_iterators.py * Update spacy/lang/nl/syntax_iterators.py Code readability Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/lang/nl/test_noun_chunks.py correct comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * finalize code * change "if next_word" into "if next_word is not None" Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> 2021-07-14 12:01:02 +00:00			`import pytest`
Fix Dutch noun chunks to skip overlapping spans (#11275) * Add test for overlapping noun chunks * Skip overlapping noun chunks * Update spacy/tests/lang/nl/test_noun_chunks.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> 2022-08-10 07:49:08 +00:00			`from spacy.tokens import Doc`
			`from spacy.util import filter_spans`
Adding noun_chunks to the DUTCH language model (nl) (#8529) * :sparkles: implement noun_chunks for dutch language * copy/paste FR and SV syntax iterators to accomodate UD tags * added tests with dutch text * signed contributor agreement * :bug: fix noun chunks generator * built from scratch * define noun chunk as a single Noun-Phrase * includes some corner cases debugging (incorrect POS tagging) * test with provided annotated sample (POS, DEP) * :white_check_mark: fix failing test * CI pipeline did not like the added sample file * add the sample as a pytest fixture * Update spacy/lang/nl/syntax_iterators.py * Update spacy/lang/nl/syntax_iterators.py Code readability Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/tests/lang/nl/test_noun_chunks.py correct comment Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * finalize code * change "if next_word" into "if next_word is not None" Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> 2021-07-14 12:01:02 +00:00

			`@pytest.fixture`
			`def nl_sample(nl_vocab):`
			`# TEXT :`
			`# Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.`
			`# Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook`
			`# geen avondeten gekocht.`
			`words = [`
			`"Haar",`
			`"vriend",`
			`"lacht",`
			`"luid",`
			`".",`
			`"We",`
			`"kregen",`
			`"alweer",`
			`"ruzie",`
			`"toen",`
			`"we",`
			`"de",`
			`"supermarkt",`
			`"ingingen",`
			`".",`
			`"Aan",`
			`"het",`
			`"begin",`
			`"van",`
			`"de",`
			`"supermarkt",`
			`"is",`
			`"al",`
			`"het",`
			`"fruit",`
			`"en",`
			`"de",`
			`"groentes",`
			`".",`
			`"Uiteindelijk",`
			`"hebben",`
			`"we",`
			`"dan",`
			`"ook",`
			`"geen",`
			`"avondeten",`
			`"gekocht",`
			`".",`
			`]`
			`heads = [`
			`1,`
			`2,`
			`2,`
			`2,`
			`2,`
			`6,`
			`6,`
			`6,`
			`6,`
			`13,`
			`13,`
			`12,`
			`13,`
			`6,`
			`6,`
			`17,`
			`17,`
			`24,`
			`20,`
			`20,`
			`17,`
			`24,`
			`24,`
			`24,`
			`24,`
			`27,`
			`27,`
			`24,`
			`24,`
			`36,`
			`36,`
			`36,`
			`36,`
			`36,`
			`35,`
			`36,`
			`36,`
			`36,`
			`]`
			`deps = [`
			`"nmod:poss",`
			`"nsubj",`
			`"ROOT",`
			`"advmod",`
			`"punct",`
			`"nsubj",`
			`"ROOT",`
			`"advmod",`
			`"obj",`
			`"mark",`
			`"nsubj",`
			`"det",`
			`"obj",`
			`"advcl",`
			`"punct",`
			`"case",`
			`"det",`
			`"obl",`
			`"case",`
			`"det",`
			`"nmod",`
			`"cop",`
			`"advmod",`
			`"det",`
			`"ROOT",`
			`"cc",`
			`"det",`
			`"conj",`
			`"punct",`
			`"advmod",`
			`"aux",`
			`"nsubj",`
			`"advmod",`
			`"advmod",`
			`"det",`
			`"obj",`
			`"ROOT",`
			`"punct",`
			`]`
			`pos = [`
			`"PRON",`
			`"NOUN",`
			`"VERB",`
			`"ADJ",`
			`"PUNCT",`
			`"PRON",`
			`"VERB",`
			`"ADV",`
			`"NOUN",`
			`"SCONJ",`
			`"PRON",`
			`"DET",`
			`"NOUN",`
			`"NOUN",`
			`"PUNCT",`
			`"ADP",`
			`"DET",`
			`"NOUN",`
			`"ADP",`
			`"DET",`
			`"NOUN",`
			`"AUX",`
			`"ADV",`
			`"DET",`
			`"NOUN",`
			`"CCONJ",`
			`"DET",`
			`"NOUN",`
			`"PUNCT",`
			`"ADJ",`
			`"AUX",`
			`"PRON",`
			`"ADV",`
			`"ADV",`
			`"DET",`
			`"NOUN",`
			`"VERB",`
			`"PUNCT",`
			`]`
			`return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)`


			`@pytest.fixture`
			`def nl_reference_chunking():`
			`# Using frog https://github.com/LanguageMachines/frog/ we obtain the following NOUN-PHRASES:`
			`return [`
			`"haar vriend",`
			`"we",`
			`"ruzie",`
			`"we",`
			`"de supermarkt",`
			`"het begin",`
			`"de supermarkt",`
			`"het fruit",`
			`"de groentes",`
			`"we",`
			`"geen avondeten",`
			`]`


			`def test_need_dep(nl_tokenizer):`
			`"""`
			`Test that noun_chunks raises Value Error for 'nl' language if Doc is not parsed.`
			`"""`
			`txt = "Haar vriend lacht luid."`
			`doc = nl_tokenizer(txt)`

			`with pytest.raises(ValueError):`
			`list(doc.noun_chunks)`


			`def test_chunking(nl_sample, nl_reference_chunking):`
			`"""`
			`Test the noun chunks of a sample text. Uses a sample.`
			`The sample text simulates a Doc object as would be produced by nl_core_news_md.`
			`"""`
			`chunks = [s.text.lower() for s in nl_sample.noun_chunks]`
			`assert chunks == nl_reference_chunking`
Fix Dutch noun chunks to skip overlapping spans (#11275) * Add test for overlapping noun chunks * Skip overlapping noun chunks * Update spacy/tests/lang/nl/test_noun_chunks.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> 2022-08-10 07:49:08 +00:00

			`@pytest.mark.issue(10846)`
			`def test_no_overlapping_chunks(nl_vocab):`
			`# fmt: off`
			`doc = Doc(`
			`nl_vocab,`
			`words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"],`
			`deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"],`
			`heads=[1, 3, 3, 3, 8, 8, 5, 8, 3],`
			`pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"],`
			`)`
			`# fmt: on`
			`chunks = list(doc.noun_chunks)`
			`assert filter_spans(chunks) == chunks`