From 1040e250d8f740db7d0a6b012962b25ce7f95ffb Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 16 Sep 2020 16:41:28 +0200
Subject: [PATCH] actual commit with test for custom readers with ml_datasets >= 0.2

---
 requirements.txt                     |  2 +-
 spacy/tests/training/test_readers.py | 64 ++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/training/test_readers.py

diff --git a/requirements.txt b/requirements.txt
index db6eae2ef..a67ade640 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
-ml_datasets>=0.1.1
+ml_datasets>=0.2.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
new file mode 100644
index 000000000..c81ec0897
--- /dev/null
+++ b/spacy/tests/training/test_readers.py
@@ -0,0 +1,64 @@
+import pytest
+from thinc.api import Config
+from spacy.util import load_model_from_config
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "reader,additional_config",
+    [
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
+    ],
+)
+def test_cat_readers(reader, additional_config):
+    """Check that a text classification reader works in a train/eval loop.
+
+    Builds a tok2vec + textcat pipeline whose training corpus comes from the
+    registered ``reader`` (configured with ``additional_config``), updates on
+    each training example, evaluates on the dev examples and runs raw text.
+    """
+    nlp_config_string = """
+    [training]
+
+    [training.corpus]
+    @readers = "PLACEHOLDER"
+
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec", "textcat"]
+
+    [components]
+
+    [components.tok2vec]
+    factory = "tok2vec"
+
+    [components.textcat]
+    factory = "textcat"
+    """
+    config = Config().from_str(nlp_config_string)
+    config["training"]["corpus"]["@readers"] = reader
+    config["training"]["corpus"].update(additional_config)
+    nlp, resolved = load_model_from_config(config, auto_fill=True)
+
+    train_corpus = resolved["training"]["corpus"]["train"]
+    optimizer = resolved["training"]["optimizer"]
+    # simulate a training loop
+    nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
+    for example in train_corpus(nlp):
+        assert example.y.cats
+        # this shouldn't fail if each training example has at least one positive label
+        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
+        nlp.update([example], sgd=optimizer)
+    # simulate performance benchmark on dev corpus
+    dev_corpus = resolved["training"]["corpus"]["dev"]
+    dev_examples = list(dev_corpus(nlp))
+    for example in dev_examples:
+        # this shouldn't fail if each dev example has at least one positive label
+        assert sorted(set(example.y.cats.values())) == [0.0, 1.0]
+    scores = nlp.evaluate(dev_examples)
+    assert scores["cats_score"]
+    # ensure the pipeline runs
+    doc = nlp("Quick test")
+    assert doc.cats