# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import random
import numpy.random

from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps

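# Fix the random seeds so repeated training runs are comparable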
random.seed(0)
numpy.random.seed(0)


@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("output directory to store model in", "positional", None, str),
    train_data=("location of JSON-formatted training data", "positional",
                None, str),
    dev_data=("location of JSON-formatted development data (optional)",
              "positional", None, str),
    n_iter=("number of iterations", "option", "n", int),
    n_sents=("number of sentences", "option", "ns", int),
    use_gpu=("Use GPU", "option", "g", int),
    vectors=("Model to load vectors from", "option", "v"),
    no_tagger=("Don't train tagger", "flag", "T", bool),
    no_parser=("Don't train parser", "flag", "P", bool),
    no_entities=("Don't train NER", "flag", "N", bool),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    version=("Model version", "option", "V", str),
    meta_path=("Optional path to meta.json. All relevant properties will be "
               "overwritten.", "option", "m", Path))
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          use_gpu=-1, vectors=None, no_tagger=False,
          no_parser=False, no_entities=False, gold_preproc=False,
          version="0.0.0", meta_path=None):
    """
    Train a model. Expects data in spaCy's JSON format.
    """
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title="Training data not found", exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title="Development data not found", exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title="meta.json not found", exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints("Expected dict but got: {}".format(type(meta)),
               title="Not a valid meta.json format", exits=1)
    meta.setdefault('lang', lang)
    meta.setdefault('name', 'unnamed')
    pipeline = ['tagger', 'parser', 'ner']
    if no_tagger and 'tagger' in pipeline:
        pipeline.remove('tagger')
    if no_parser and 'parser' in pipeline:
        pipeline.remove('parser')
    if no_entities and 'ner' in pipeline:
        pipeline.remove('ner')
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
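    # Note: with the defaults above, dropout is effectively constant at 0.2
    # (decay 0.0), while batch sizes compound from 1 towards a cap of 16,
    # growing 0.1% per batch; minibatch() casts each yielded value to int
    # when it slices out a batch.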
    max_doc_len = util.env_opt('max_doc_len', 5000)
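    # GoldCorpus streams the annotations from disk; count_train() totals the
    # training words so each epoch's progress bar has a fixed length.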
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()
    lang_class = util.get_lang_class(lang)
    nlp = lang_class()
    meta['pipeline'] = pipeline
    nlp.meta.update(meta)
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
    for name in pipeline:
        nlp.add_pipe(nlp.create_pipe(name), name=name)
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
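    # begin_training() also stores an optimizer on the nlp object; clear that
    # cached reference so all updates go through the `optimizer` returned
    # above (and so it isn't dragged along when the model is pickled).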
    nlp._optimizer = None
    print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
    try:
        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
                                       gold_preproc=gold_preproc, max_length=0)
        train_docs = list(train_docs)
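        # train_docs is materialised as a list so that each of the n_iter
        # epochs below can iterate over the same data from the start.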
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
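                    # Skip very long documents (>= max_doc_len tokens): they
                    # make updates slow and memory-hungry.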
                    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))
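            # End of epoch: swap in the averaged parameters, save a numbered
            # checkpoint, then reload it and score it on the dev data.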
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(corpus.dev_docs(
                    nlp_loaded,
                    gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs)
                end_time = timer()
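                # Evaluation speed is reported in words per second. If we
                # trained on GPU, reload on CPU and evaluate again so both
                # cpu_wps and gpu_wps are available for meta.json.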
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords/(end_time-start_time)
                else:
                    gpu_wps = nwords/(end_time-start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                        dev_docs = list(corpus.dev_docs(
                            nlp_loaded, gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords/(end_time-start_time)
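                # Write the epoch's dev accuracy next to the checkpoint, and
                # refresh meta.json with accuracy, speed and vector stats.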
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
                                 'gpu': gpu_wps}
                meta['vectors'] = {'width': nlp.vocab.vectors_length,
                                   'vectors': len(nlp.vocab.vectors),
                                   'keys': nlp.vocab.vectors.n_keys}
                meta['lang'] = nlp.lang
                meta['pipeline'] = pipeline
                meta['spacy_version'] = '>=%s' % about.__version__
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)
                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
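    # However training ends -- including a keyboard interrupt -- keep what we
    # have: pickle the whole pipeline with the averaged weights.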
    finally:
        print("Saving model...")
        try:
            with (output_path / 'model-final.pickle').open('wb') as file_:
                with nlp.use_params(optimizer.averages):
                    dill.dump(nlp, file_, -1)
        except Exception:
            print("Error saving model")


def _render_parses(i, to_render):
    to_render[0].user_data['title'] = "Batch %d" % i
    with Path('/tmp/entities.html').open('w') as file_:
        html = displacy.render(to_render[:5], style='ent', page=True)
        file_.write(html)
    with Path('/tmp/parses.html').open('w') as file_:
        html = displacy.render(to_render[:5], style='dep', page=True)
        file_.write(html)


def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
    scores['cpu_wps'] = cpu_wps
    scores['gpu_wps'] = gpu_wps or 0.0
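    # One tab-separated row per epoch, in the same column order as the
    # header printed before training starts.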
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',
        '{ner_loss:.3f}',
        '{uas:.3f}',
        '{ents_p:.3f}',
        '{ents_r:.3f}',
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{cpu_wps:.1f}',
        '{gpu_wps:.1f}',
    ))
    print(tpl.format(itn, **scores))


def print_results(scorer):
    results = {
        'TOK': '%.2f' % scorer.token_acc,
        'POS': '%.2f' % scorer.tags_acc,
        'UAS': '%.2f' % scorer.uas,
        'LAS': '%.2f' % scorer.las,
        'NER P': '%.2f' % scorer.ents_p,
        'NER R': '%.2f' % scorer.ents_r,
        'NER F': '%.2f' % scorer.ents_f}
    util.print_table(results, title="Results")