From d8573ee715701cc7227378d08bb4b7e3bc62c030 Mon Sep 17 00:00:00 2001
From: BreakBB <33514570+BreakBB@users.noreply.github.com>
Date: Sun, 16 Jun 2019 13:22:57 +0200
Subject: [PATCH] Update error raising for CLI pretrain to fix #3840 (#3843)

* Add check for empty input file to CLI pretrain
* Raise error if JSONL is not a dict or contains neither `tokens` nor `text` key
* Skip empty values for correct pretrain keys and log a counter as warning
* Add tests for CLI pretrain core function make_docs
* Add a short hint for the `tokens` key to the CLI pretrain docs
* Add success message to CLI pretrain
* Update model loading to fix the tests
* Skip empty values and do not create docs out of them
---
 spacy/cli/pretrain.py   | 28 ++++++++++++++++++++++++----
 spacy/errors.py         |  6 ++++++
 spacy/tests/test_cli.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 website/docs/api/cli.md |  2 +-
 4 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 18fe0598f..be8733c62 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -13,6 +13,7 @@ from thinc.neural.util import prefer_gpu, get_array_module
 from wasabi import Printer
 import srsly
 
+from ..errors import Errors
 from ..tokens import Doc
 from ..attrs import ID, HEAD
 from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
@@ -101,6 +102,8 @@ def pretrain(
             msg.fail("Input text file doesn't exist", texts_loc, exits=1)
         with msg.loading("Loading input texts..."):
             texts = list(srsly.read_jsonl(texts_loc))
+            if not texts:
+                msg.fail("Input file is empty", texts_loc, exits=1)
         msg.good("Loaded input texts")
         random.shuffle(texts)
     else:  # reading from stdin
@@ -149,16 +152,18 @@
     with (output_dir / "log.jsonl").open("a") as file_:
         file_.write(srsly.json_dumps(log) + "\n")
 
+    skip_counter = 0
     for epoch in range(n_iter):
         for batch_id, batch in enumerate(
             util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
         ):
-            docs = make_docs(
+            docs, count = make_docs(
                 nlp,
                 [text for (text, _) in batch],
                 max_length=max_length,
                 min_length=min_length,
             )
+            skip_counter += count
             loss = make_update(
                 model, docs, optimizer, objective=loss_func, drop=dropout
             )
@@ -174,6 +179,9 @@
         if texts_loc != "-":
             # Reshuffle the texts if texts were loaded from a file
             random.shuffle(texts)
+    if skip_counter > 0:
+        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
+    msg.good("Successfully finished pretrain")
 
 
 def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
@@ -195,12 +203,24 @@
 
 def make_docs(nlp, batch, min_length, max_length):
     docs = []
+    skip_count = 0
     for record in batch:
+        if not isinstance(record, dict):
+            raise TypeError(Errors.E137.format(type=type(record), line=record))
         if "tokens" in record:
-            doc = Doc(nlp.vocab, words=record["tokens"])
-        else:
+            words = record["tokens"]
+            if not words:
+                skip_count += 1
+                continue
+            doc = Doc(nlp.vocab, words=words)
+        elif "text" in record:
             text = record["text"]
+            if not text:
+                skip_count += 1
+                continue
             doc = nlp.make_doc(text)
+        else:
+            raise ValueError(Errors.E138.format(text=record))
         if "heads" in record:
             heads = record["heads"]
             heads = numpy.asarray(heads, dtype="uint64")
@@ -208,7 +228,7 @@
             doc = doc.from_array([HEAD], heads)
         if len(doc) >= min_length and len(doc) < max_length:
             docs.append(doc)
-    return docs
+    return docs, skip_count
 
 
 def get_vectors_loss(ops, docs, prediction, objective="L2"):
diff --git a/spacy/errors.py b/spacy/errors.py
index 3a1e05e05..fcc3132c6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -393,6 +393,12 @@ class Errors(object):
             "`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
     E136 = ("This additional feature requires the jsonschema library to be "
             "installed:\npip install jsonschema")
+    E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure to provide a valid JSON "
+            "object as input with either the `text` or `tokens` key. For more info, see the docs:\n"
+            "https://spacy.io/api/cli#pretrain-jsonl")
+    E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input includes either the "
+            "`text` or `tokens` key. For more info, see the docs:\n"
+            "https://spacy.io/api/cli#pretrain-jsonl")
 
 
 @add_codes
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 2afa6a71b..f0c34276d 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,7 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import pytest
+
+from spacy.lang.en import English
 from spacy.cli.converters import conllu2json
+from spacy.cli.pretrain import make_docs
 
 
 def test_cli_converters_conllu2json():
@@ -26,3 +30,45 @@
     assert [t["head"] for t in tokens] == [1, 2, -1, 0]
     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
+
+
+def test_pretrain_make_docs():
+    nlp = English()
+
+    valid_jsonl_text = {"text": "Some text"}
+    docs, skip_count = make_docs(nlp, [valid_jsonl_text], 1, 10)
+    assert len(docs) == 1
+    assert skip_count == 0
+
+    valid_jsonl_tokens = {"tokens": ["Some", "tokens"]}
+    docs, skip_count = make_docs(nlp, [valid_jsonl_tokens], 1, 10)
+    assert len(docs) == 1
+    assert skip_count == 0
+
+    invalid_jsonl_type = 0
+    with pytest.raises(TypeError):
+        make_docs(nlp, [invalid_jsonl_type], 1, 100)
+
+    invalid_jsonl_key = {"invalid": "Does not matter"}
+    with pytest.raises(ValueError):
+        make_docs(nlp, [invalid_jsonl_key], 1, 100)
+
+    empty_jsonl_text = {"text": ""}
+    docs, skip_count = make_docs(nlp, [empty_jsonl_text], 1, 10)
+    assert len(docs) == 0
+    assert skip_count == 1
+
+    empty_jsonl_tokens = {"tokens": []}
+    docs, skip_count = make_docs(nlp, [empty_jsonl_tokens], 1, 10)
+    assert len(docs) == 0
+    assert skip_count == 1
+
+    too_short_jsonl = {"text": "This text is not long enough"}
+    docs, skip_count = make_docs(nlp, [too_short_jsonl], 10, 15)
+    assert len(docs) == 0
+    assert skip_count == 0
+
+    too_long_jsonl = {"text": "This text contains way too much tokens for this test"}
+    docs, skip_count = make_docs(nlp, [too_long_jsonl], 1, 5)
+    assert len(docs) == 0
+    assert skip_count == 0
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8b982309a..ac4c7eddb 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -291,7 +291,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
 
 | Argument                | Type       | Description                                                                                                                         |
 | ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `texts_loc`             | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"`. [See here](#pretrain-jsonl) for details.  |
+| `texts_loc`             | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `tokens`. [See here](#pretrain-jsonl) for details. |
 | `vectors_model`         | positional | Name or path to spaCy model with vectors to learn from.                                                                            |
 | `output_dir`            | positional | Directory to write models to on each epoch.                                                                                        |
 | `--width`, `-cw`        | option     | Width of CNN layers.                                                                                                                |
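
Note on the expected input (editor's sketch, not part of the patch): each line of
the JSONL file passed as `texts_loc` must be a JSON object carrying either a
`text` key or a `tokens` key, which is what `make_docs` now enforces via the new
E137/E138 errors; empty values are skipped and reported through the `skip_counter`
warning. The concrete lines below are illustrative only:

    {"text": "Some raw text to pretrain on."}
    {"tokens": ["Some", "pre", "-", "tokenized", "text"]}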
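
Since `make_docs` now returns a `(docs, skip_count)` tuple rather than a bare
list, any caller must unpack both values, as the updated training loop does. A
minimal sketch of the new contract, mirroring the added tests (the record values
here are illustrative):

    from spacy.lang.en import English
    from spacy.cli.pretrain import make_docs

    nlp = English()
    records = [
        {"text": "Some raw text"},       # becomes a Doc via nlp.make_doc
        {"tokens": ["Some", "tokens"]},  # becomes a Doc from pre-tokenized words
        {"text": ""},                    # empty value: skipped and counted
    ]
    docs, skip_count = make_docs(nlp, records, 1, 10)
    assert len(docs) == 2
    assert skip_count == 1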