Disable gold preprocessing

Matthew Honnibal 2017-05-24 20:10:20 -05:00
parent 467bbeadb8
commit 135a13790c
1 changed file with 9 additions and 6 deletions

@@ -68,14 +68,16 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     for i in range(n_iter):
         with tqdm.tqdm(total=n_train_docs) as pbar:
-            train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True)
+            train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True,
+                                           gold_preproc=False)
+            losses = {}
             idx = 0
             while idx < n_train_docs:
                 batch = list(cytoolz.take(int(batch_size), train_docs))
                 if not batch:
                     break
                 docs, golds = zip(*batch)
-                nlp.update(docs, golds, drop=dropout, sgd=optimizer)
+                nlp.update(docs, golds, drop=dropout, sgd=optimizer, losses=losses)
                 pbar.update(len(docs))
                 idx += len(docs)
                 batch_size *= batch_accel
@@ -83,12 +85,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx)
         with nlp.use_params(optimizer.averages):
             start = timer()
-            scorer = nlp.evaluate(corpus.dev_docs(nlp))
+            scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False))
             end = timer()
             n_words = scorer.tokens.tp + scorer.tokens.fn
             assert n_words != 0
             wps = n_words / (end-start)
-        print_progress(i, {}, scorer.scores, wps=wps)
+        print_progress(i, losses, scorer.scores, wps=wps)
     with (output_path / 'model.bin').open('wb') as file_:
         with nlp.use_params(optimizer.averages):
             dill.dump(nlp, file_, -1)
@@ -109,9 +111,10 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                 'ents_p', 'ents_r', 'ents_f', 'wps']:
         scores[col] = 0.0
-    scores.update(losses)
+    scores['dep_loss'] = losses.get('parser', 0.0)
+    scores['tag_loss'] = losses.get('tagger', 0.0)
     scores.update(dev_scores)
-    scores[wps] = wps
+    scores['wps'] = wps
     tpl = '\t'.join((
         '{:d}',
         '{dep_loss:.3f}',
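
Besides turning off gold preprocessing for both training and dev docs, the commit threads a per-component losses dict through nlp.update() and into print_progress() instead of an empty dict. A minimal sketch of the resulting column mapping is below; the helper name build_progress_row and the sample numbers are illustrative only, not part of the commit or the spaCy API.

# Minimal sketch (not part of the commit): how the losses dict filled in by
# nlp.update(..., losses=losses) maps onto the progress-row columns.
# build_progress_row is a hypothetical helper mirroring print_progress().

def build_progress_row(itn, losses, dev_scores, wps=0.0):
    cols = ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
            'ents_p', 'ents_r', 'ents_f', 'wps']
    scores = {col: 0.0 for col in cols}
    scores['dep_loss'] = losses.get('parser', 0.0)   # parser loss column
    scores['tag_loss'] = losses.get('tagger', 0.0)   # tagger loss column
    scores.update(dev_scores)                        # accuracies from the dev scorer
    scores['wps'] = wps                              # dev-set words per second
    return scores

# Example with made-up numbers:
row = build_progress_row(0, {'parser': 12.5, 'tagger': 3.1},
                         {'uas': 81.2, 'ents_f': 72.4}, wps=14000.0)
print(row['dep_loss'], row['tag_loss'], row['uas'])  # 12.5 3.1 81.2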