From 21af12eb533365ff4fd711187d062dab0eb0abab Mon Sep 17 00:00:00 2001 From: devforfu Date: Sat, 11 May 2019 18:41:29 +0500 Subject: [PATCH] Make "text" key in JSONL format optional when "tokens" key is provided (#3721) * Fix issue with forcing text key when it is not required * Extending the docs to reflect the new behavior --- spacy/cli/pretrain.py | 2 +- website/docs/api/cli.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index ef91937a6..b2c22d929 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -181,10 +181,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"): def make_docs(nlp, batch, min_length, max_length): docs = [] for record in batch: - text = record["text"] if "tokens" in record: doc = Doc(nlp.vocab, words=record["tokens"]) else: + text = record["text"] doc = nlp.make_doc(text) if "heads" in record: heads = record["heads"] diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ef3e6f84a..7788d7a8f 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -327,7 +327,7 @@ tokenization can be provided. | Key | Type | Description | | -------- | ------- | -------------------------------------------- | -| `text` | unicode | The raw input text. | +| `text` | unicode | The raw input text. Not required if `tokens` is provided. | | `tokens` | list | Optional tokenization, one string per token. | ```json @@ -335,6 +335,7 @@ tokenization can be provided. {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} {"text": "My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. 
Ironically both are run horribly and most, that I've talked to, come out wishing they never went in."} +{"tokens": ["If", "tokens", "are", "provided", "then", "we", "can", "skip", "the", "raw", "input", "text"]} ``` ## Init Model {#init-model new="2"}