From 3376d4d6e85ae1da26f0c86711f5c12b3ad3b1b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 19 May 2017 18:15:50 -0500 Subject: [PATCH] Update the train script, fixing GPU memory leak --- spacy/cli/train.py | 49 ++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 7ddc8d1cd..fa7d85798 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -17,7 +17,7 @@ from .. import displacy def train(language, output_dir, train_data, dev_data, n_iter, n_sents, - use_gpu, tagger, parser, ner, parser_L1): + use_gpu, no_tagger, no_parser, no_entities, parser_L1): output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) dev_path = util.ensure_path(dev_data) @@ -44,9 +44,11 @@ def train(language, output_dir, train_data, dev_data, n_iter, n_sents, 'lang': language, 'features': lang.Defaults.tagger_features} gold_train = list(read_gold_json(train_path, limit=n_sents)) - gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None + gold_dev = list(read_gold_json(dev_path, limit=n_sents)) - train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu) + train_model(lang, gold_train, gold_dev, output_path, n_iter, + no_tagger=no_tagger, no_parser=no_parser, no_entities=no_entities, + use_gpu=use_gpu) if gold_dev: scorer = evaluate(lang, gold_dev, output_path) print_results(scorer) @@ -65,34 +67,43 @@ def train_config(config): def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg): print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %") - nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies']) + pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] + if cfg.get('no_tagger') and 'tags' in pipeline: + pipeline.remove('tags') + if cfg.get('no_parser') and 'dependencies' in pipeline: + pipeline.remove('dependencies') + if cfg.get('no_entities') and 'entities' in pipeline: + pipeline.remove('entities') + print(pipeline) + nlp = Language(pipeline=pipeline) dropout = util.env_opt('dropout', 0.0) # TODO: Get spaCy using Thinc's trainer and optimizer with nlp.begin_training(train_data, **cfg) as (trainer, optimizer): - for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)): + for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=False)): losses = defaultdict(float) - to_render = [] for i, (docs, golds) in enumerate(epoch): - state = nlp.update(docs, golds, drop=dropout, sgd=optimizer) - losses['dep_loss'] += state.get('parser_loss', 0.0) - losses['tag_loss'] += state.get('tag_loss', 0.0) - to_render.insert(0, nlp(docs[-1].text)) - to_render[0].user_data['title'] = "Batch %d" % i - with Path('/tmp/entities.html').open('w') as file_: - html = displacy.render(to_render[:5], style='ent', page=True) - file_.write(html) - with Path('/tmp/parses.html').open('w') as file_: - html = displacy.render(to_render[:5], style='dep', page=True) - file_.write(html) + nlp.update(docs, golds, drop=dropout, sgd=optimizer) + for doc in docs: + doc.tensor = None + doc._py_tokens = [] if dev_data: with nlp.use_params(optimizer.averages): - dev_scores = trainer.evaluate(dev_data).scores + dev_scores = trainer.evaluate(dev_data, gold_preproc=False).scores else: dev_scores = defaultdict(float) print_progress(itn, losses, dev_scores) with (output_path / 'model.bin').open('wb') as file_: dill.dump(nlp, file_, -1) - #nlp.to_disk(output_path, tokenizer=False) + + +def _render_parses(i, to_render): + to_render[0].user_data['title'] = "Batch %d" % i + with Path('/tmp/entities.html').open('w') as file_: + html = displacy.render(to_render[:5], style='ent', page=True) + file_.write(html) + with Path('/tmp/parses.html').open('w') as file_: + html = displacy.render(to_render[:5], style='dep', page=True) + file_.write(html) def evaluate(Language, gold_tuples, path):