spaCy/spacy/tests/test_cli.py

# coding: utf-8
from __future__ import unicode_literals

import pytest

from spacy.lang.en import English
from spacy.cli.converters import conllu2json, iob2json, conll_ner2json
from spacy.cli.pretrain import make_docs


def test_cli_converters_conllu2json():
    """Convert one four-token CoNLL-U sentence and verify the JSON structure.

    The input carries an IOB tag in the last column. The expected output has
    a single document (id 0) with one paragraph and one sentence, heads
    expressed as offsets relative to each token, the "root" relation
    upper-cased, and the entity-final "I-PER" rendered as "L-PER".
    """
    # Rows taken from:
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    conllu_rows = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
        "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",
        "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    docs = conllu2json("\n".join(conllu_rows), n_sents=1)

    # Walk down the nesting one level at a time: doc -> paragraph -> sentence.
    assert len(docs) == 1
    doc = docs[0]
    assert doc["id"] == 0
    paragraphs = doc["paragraphs"]
    assert len(paragraphs) == 1
    sentences = paragraphs[0]["sentences"]
    assert len(sentences) == 1
    tokens = sentences[0]["tokens"]
    assert len(tokens) == 4

    # Per-token fields, checked in a fixed order.
    expected_fields = [
        ("orth", ["Dommer", "Finn", "Eilertsen", "avstår"]),
        ("tag", ["NOUN", "PROPN", "PROPN", "VERB"]),
        ("head", [1, 2, -1, 0]),
        ("dep", ["appos", "nsubj", "name", "ROOT"]),
        ("ner", ["O", "B-PER", "L-PER", "O"]),
    ]
    for field, expected in expected_fields:
        assert [token[field] for token in tokens] == expected


def test_cli_converters_iob2json():
    """Convert sentence-per-line IOB data and verify every sentence.

    Four variants of the same sentence are supplied: with and without a POS
    column, and with "London" tagged either I-GPE or B-GPE. All four are
    expected to end up in one document/paragraph with identical tokens and
    identical entity tags.
    """
    iob_sentences = [
        "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
        "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
        "I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
        "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
    ]
    expected_words = ["I", "like", "London", "and", "New", "York", "City", "."]
    expected_ents = ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]

    docs = iob2json("\n".join(iob_sentences), n_sents=10)
    assert len(docs) == 1
    doc = docs[0]
    assert doc["id"] == 0
    paragraphs = doc["paragraphs"]
    assert len(paragraphs) == 1
    sentences = paragraphs[0]["sentences"]
    assert len(sentences) == 4

    # Every input variant must produce the same tokens and entity tags.
    for sentence in sentences:
        tokens = sentence["tokens"]
        assert len(tokens) == 8
        assert [token["orth"] for token in tokens] == expected_words
        assert [token["ner"] for token in tokens] == expected_ents


def test_cli_converters_conll_ner2json():
    """Convert token-per-line CoNLL NER data in several column layouts.

    The same eight-token sentence is given five times, separated by blank
    lines, each time with a different column layout (tab- or space-separated;
    two, three or four columns). In every layout the token is the first
    column and the IOB tag is the last. All five sentences are expected to
    land in a single document/paragraph with identical tokens and identical
    entity tags.
    """
    lines = [
        # Document marker followed by a blank line.
        "-DOCSTART- -X- O O",
        "",
        # token <TAB> tag
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        # token <SPACE> tag
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        # token <SPACE> pos <SPACE> tag
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        # token <SPACE> pos <SPACE> placeholder <SPACE> tag
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        # token <TAB> pos <TAB> placeholder <TAB> tag
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    expected_words = ["I", "like", "London", "and", "New", "York", "City", "."]
    expected_ents = ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]

    input_data = "\n".join(lines)
    converted = conll_ner2json(input_data, n_sents=10)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    sentences = converted[0]["paragraphs"][0]["sentences"]
    assert len(sentences) == 5

    # Every column layout must produce the same tokens and entity tags.
    for sent in sentences:
        tokens = sent["tokens"]
        assert len(tokens) == 8
        assert [t["orth"] for t in tokens] == expected_words
        assert [t["ner"] for t in tokens] == expected_ents


def test_pretrain_make_docs():
    """Exercise `make_docs` with valid, malformed, empty and out-of-range input.

    Every call passes a single-record batch. The two trailing integers look
    like minimum/maximum length bounds (judging by the "too short" and
    "too long" cases) -- confirm against `make_docs` itself.
    """
    nlp = English()

    def check(record, lower, upper, n_docs, n_skipped):
        # Run one record through make_docs and compare both return values.
        docs, skip_count = make_docs(nlp, [record], lower, upper)
        assert len(docs) == n_docs
        assert skip_count == n_skipped

    # Well-formed records: raw text or a token list each yield one doc.
    check({"text": "Some text"}, 1, 10, 1, 0)
    check({"tokens": ["Some", "tokens"]}, 1, 10, 1, 0)

    # A record that is not a dict is rejected with TypeError.
    with pytest.raises(TypeError):
        make_docs(nlp, [0], 1, 100)

    # A dict with neither a "text" nor a "tokens" key is rejected with ValueError.
    with pytest.raises(ValueError):
        make_docs(nlp, [{"invalid": "Does not matter"}], 1, 100)

    # Empty values produce no doc and are counted as skipped.
    check({"text": ""}, 1, 10, 0, 1)
    check({"tokens": []}, 1, 10, 0, 1)

    # Records outside the given bounds produce no doc but are not counted as skipped.
    check({"text": "This text is not long enough"}, 10, 15, 0, 0)
    check({"text": "This text contains way too much tokens for this test"}, 1, 5, 0, 0)
Port over #2949 2018-11-26 17:54:27 +00:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

Update error raising for CLI pretrain to fix #3840 (#3843) * Add check for empty input file to CLI pretrain * Raise error if JSONL is not a dict or contains neither `tokens` nor `text` key * Skip empty values for correct pretrain keys and log a counter as warning * Add tests for CLI pretrain core function make_docs. * Add a short hint for the `tokens` key to the CLI pretrain docs * Add success message to CLI pretrain * Update model loading to fix the tests * Skip empty values and do not create docs out of it 2019-06-16 11:22:57 +00:00			`import pytest`

			`from spacy.lang.en import English`
Updates/bugfixes for NER/IOB converters (#4186) * Updates/bugfixes for NER/IOB converters * Converter formats `ner` and `iob` use autodetect to choose a converter if possible * `iob2json` is reverted to handle sentence-per-line data like `word1\|pos1\|ent1 word2\|pos2\|ent2` * Fix bug in `merge_sentences()` so the second sentence in each batch isn't skipped * `conll_ner2json` is made more general so it can handle more formats with whitespace-separated columns * Supports all formats where the first column is the token and the final column is the IOB tag; if present, the second column is the POS tag * As in CoNLL 2003 NER, blank lines separate sentences, `-DOCSTART- -X- O O` separates documents * Add option for segmenting sentences (new flag `-s`) * Parser-based sentence segmentation with a provided model, otherwise with sentencizer (new option `-b` to specify model) * Can group sentences into documents with `n_sents` as long as sentence segmentation is available * Only applies automatic segmentation when there are no existing delimiters in the data * Provide info about settings applied during conversion with warnings and suggestions if settings conflict or might not be not optimal. * Add tests for common formats * Add '(default)' back to docs for -c auto * Add document count back to output * Revert changes to converter output message * Use explicit tabs in convert CLI test data * Adjust/add messages for n_sents=1 default * Add sample NER data to training examples * Update README * Add links in docs to example NER data * Define msg within converters 2019-08-29 10:04:01 +00:00			`from spacy.cli.converters import conllu2json, iob2json, conll_ner2json`
Update error raising for CLI pretrain to fix #3840 (#3843) * Add check for empty input file to CLI pretrain * Raise error if JSONL is not a dict or contains neither `tokens` nor `text` key * Skip empty values for correct pretrain keys and log a counter as warning * Add tests for CLI pretrain core function make_docs. * Add a short hint for the `tokens` key to the CLI pretrain docs * Add success message to CLI pretrain * Update model loading to fix the tests * Skip empty values and do not create docs out of it 2019-06-16 11:22:57 +00:00			`from spacy.cli.pretrain import make_docs`
Merging conversion scripts for conll formats (#3405) * merging conllu/conll and conllubio scripts * tabs to spaces * removing conllubio2json from converters/__init__.py * Move not-really-CLI tests to misc * Add converter test using no-ud data * Fix test I broke * removing include_biluo parameter * fixing read_conllx * remove include_biluo from convert.py 2019-03-15 17:14:46 +00:00

			`def test_cli_converters_conllu2json():`
			`# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu`
			`lines = [`
			`"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind\|Gender=Masc\|Number=Sing\t2\tappos\t_\tO",`
			`"2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER",`
			`"3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER",`
			`"4\tavstår\tavstå\tVERB\t_\tMood=Ind\|Tense=Pres\|VerbForm=Fin\t0\troot\t_\tO",`
			`]`
			`input_data = "\n".join(lines)`
			`converted = conllu2json(input_data, n_sents=1)`
			`assert len(converted) == 1`
			`assert converted[0]["id"] == 0`
			`assert len(converted[0]["paragraphs"]) == 1`
			`assert len(converted[0]["paragraphs"][0]["sentences"]) == 1`
			`sent = converted[0]["paragraphs"][0]["sentences"][0]`
			`assert len(sent["tokens"]) == 4`
			`tokens = sent["tokens"]`
			`assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"]`
			`assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]`
			`assert [t["head"] for t in tokens] == [1, 2, -1, 0]`
			`assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]`
			`assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]`
Update error raising for CLI pretrain to fix #3840 (#3843) * Add check for empty input file to CLI pretrain * Raise error if JSONL is not a dict or contains neither `tokens` nor `text` key * Skip empty values for correct pretrain keys and log a counter as warning * Add tests for CLI pretrain core function make_docs. * Add a short hint for the `tokens` key to the CLI pretrain docs * Add success message to CLI pretrain * Update model loading to fix the tests * Skip empty values and do not create docs out of it 2019-06-16 11:22:57 +00:00

Updates/bugfixes for NER/IOB converters (#4186) * Updates/bugfixes for NER/IOB converters * Converter formats `ner` and `iob` use autodetect to choose a converter if possible * `iob2json` is reverted to handle sentence-per-line data like `word1\|pos1\|ent1 word2\|pos2\|ent2` * Fix bug in `merge_sentences()` so the second sentence in each batch isn't skipped * `conll_ner2json` is made more general so it can handle more formats with whitespace-separated columns * Supports all formats where the first column is the token and the final column is the IOB tag; if present, the second column is the POS tag * As in CoNLL 2003 NER, blank lines separate sentences, `-DOCSTART- -X- O O` separates documents * Add option for segmenting sentences (new flag `-s`) * Parser-based sentence segmentation with a provided model, otherwise with sentencizer (new option `-b` to specify model) * Can group sentences into documents with `n_sents` as long as sentence segmentation is available * Only applies automatic segmentation when there are no existing delimiters in the data * Provide info about settings applied during conversion with warnings and suggestions if settings conflict or might not be not optimal. * Add tests for common formats * Add '(default)' back to docs for -c auto * Add document count back to output * Revert changes to converter output message * Use explicit tabs in convert CLI test data * Adjust/add messages for n_sents=1 default * Add sample NER data to training examples * Update README * Add links in docs to example NER data * Define msg within converters 2019-08-29 10:04:01 +00:00			`def test_cli_converters_iob2json():`
			`lines = [`
			`"I\|O like\|O London\|I-GPE and\|O New\|B-GPE York\|I-GPE City\|I-GPE .\|O",`
			`"I\|O like\|O London\|B-GPE and\|O New\|B-GPE York\|I-GPE City\|I-GPE .\|O",`
			`"I\|PRP\|O like\|VBP\|O London\|NNP\|I-GPE and\|CC\|O New\|NNP\|B-GPE York\|NNP\|I-GPE City\|NNP\|I-GPE .\|.\|O",`
			`"I\|PRP\|O like\|VBP\|O London\|NNP\|B-GPE and\|CC\|O New\|NNP\|B-GPE York\|NNP\|I-GPE City\|NNP\|I-GPE .\|.\|O",`
			`]`
			`input_data = "\n".join(lines)`
			`converted = iob2json(input_data, n_sents=10)`
			`assert len(converted) == 1`
			`assert converted[0]["id"] == 0`
			`assert len(converted[0]["paragraphs"]) == 1`
			`assert len(converted[0]["paragraphs"][0]["sentences"]) == 4`
			`for i in range(0, 4):`
			`sent = converted[0]["paragraphs"][0]["sentences"][i]`
			`assert len(sent["tokens"]) == 8`
			`tokens = sent["tokens"]`
			`assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]`
			`assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]`


			`def test_cli_converters_conll_ner2json():`
			`lines = [`
			`"-DOCSTART- -X- O O",`
			`"",`
			`"I\tO",`
			`"like\tO",`
			`"London\tB-GPE",`
			`"and\tO",`
			`"New\tB-GPE",`
			`"York\tI-GPE",`
			`"City\tI-GPE",`
			`".\tO",`
			`"",`
			`"I O",`
			`"like O",`
			`"London B-GPE",`
			`"and O",`
			`"New B-GPE",`
			`"York I-GPE",`
			`"City I-GPE",`
			`". O",`
			`"",`
			`"I PRP O",`
			`"like VBP O",`
			`"London NNP B-GPE",`
			`"and CC O",`
			`"New NNP B-GPE",`
			`"York NNP I-GPE",`
			`"City NNP I-GPE",`
			`". . O",`
			`"",`
			`"I PRP _ O",`
			`"like VBP _ O",`
			`"London NNP _ B-GPE",`
			`"and CC _ O",`
			`"New NNP _ B-GPE",`
			`"York NNP _ I-GPE",`
			`"City NNP _ I-GPE",`
			`". . _ O",`
			`"",`
			`"I\tPRP\t_\tO",`
			`"like\tVBP\t_\tO",`
			`"London\tNNP\t_\tB-GPE",`
			`"and\tCC\t_\tO",`
			`"New\tNNP\t_\tB-GPE",`
			`"York\tNNP\t_\tI-GPE",`
			`"City\tNNP\t_\tI-GPE",`
			`".\t.\t_\tO",`
			`]`
			`input_data = "\n".join(lines)`
			`converted = conll_ner2json(input_data, n_sents=10)`
			`print(converted)`
			`assert len(converted) == 1`
			`assert converted[0]["id"] == 0`
			`assert len(converted[0]["paragraphs"]) == 1`
			`assert len(converted[0]["paragraphs"][0]["sentences"]) == 5`
			`for i in range(0, 5):`
			`sent = converted[0]["paragraphs"][0]["sentences"][i]`
			`assert len(sent["tokens"]) == 8`
			`tokens = sent["tokens"]`
			`assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]`
			`assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]`


Update error raising for CLI pretrain to fix #3840 (#3843) * Add check for empty input file to CLI pretrain * Raise error if JSONL is not a dict or contains neither `tokens` nor `text` key * Skip empty values for correct pretrain keys and log a counter as warning * Add tests for CLI pretrain core function make_docs. * Add a short hint for the `tokens` key to the CLI pretrain docs * Add success message to CLI pretrain * Update model loading to fix the tests * Skip empty values and do not create docs out of it 2019-06-16 11:22:57 +00:00			`def test_pretrain_make_docs():`
			`nlp = English()`

			`valid_jsonl_text = {"text": "Some text"}`
			`docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)`
			`assert len(docs) == 1`
			`assert skip_count == 0`

			`valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}`
			`docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)`
			`assert len(docs) == 1`
			`assert skip_count == 0`

			`invalid_jsonl_type = 0`
			`with pytest.raises(TypeError):`
			`make_docs(nlp, [invalid_jsonl_type], 1, 100)`

			`invalid_jsonl_key = {"invalid": "Does not matter"}`
			`with pytest.raises(ValueError):`
			`make_docs(nlp, [invalid_jsonl_key], 1, 100)`

			`empty_jsonl_text = {"text": ""}`
			`docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)`
			`assert len(docs) == 0`
			`assert skip_count == 1`

			`empty_jsonl_tokens = {"tokens": []}`
			`docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)`
			`assert len(docs) == 0`
			`assert skip_count == 1`

			`too_short_jsonl = {"text": "This text is not long enough"}`
			`docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)`
			`assert len(docs) == 0`
			`assert skip_count == 0`

			`too_long_jsonl = {"text": "This text contains way too much tokens for this test"}`
			`docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)`
			`assert len(docs) == 0`
			`assert skip_count == 0`