From c52fde40f49780077e92cbe4869caa9ba29cfc06 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 4 Jun 2017 20:18:37 -0500
Subject: [PATCH] Improve train CLI: create output dir, save accuracy.json

---
 spacy/cli/train.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 61278e2a3..af028dae5 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -18,6 +18,7 @@ from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
 from .. import displacy
+from ..compat import json_dumps
 
 
 @plac.annotations(
@@ -44,7 +45,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=1)
+        output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
@@ -74,7 +75,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     else:
         nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
-    n_train_docs = corpus.count_train()
+    n_train_words = corpus.count_train()
 
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
 
@@ -83,7 +84,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
         for i in range(n_iter):
             if resume:
                 i += 20
-            with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar:
+            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                 train_docs = corpus.train_docs(nlp, projectivize=True,
                                                gold_preproc=False, max_length=0)
                 losses = {}
@@ -91,7 +92,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                     docs, golds = zip(*batch)
                     nlp.update(docs, golds, sgd=optimizer,
                                drop=next(dropout_rates), losses=losses)
-                    pbar.update(len(docs))
+                    pbar.update(sum(len(doc) for doc in docs))
 
             with nlp.use_params(optimizer.averages):
                 util.set_env_log(False)
@@ -105,6 +106,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                             corpus.dev_docs(
                                 nlp_loaded,
                                 gold_preproc=False))
+                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
+                with acc_loc.open('w') as file_:
+                    file_.write(json_dumps(scorer.scores))
                 util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally: