mirror of https://github.com/explosion/spaCy.git
Delete old training scripts (resolves #911)
This commit is contained in:
parent
3f20efe165
commit
8bc05c2ba9
|
@ -1,130 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import codecs
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import gzip
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
|
||||||
|
|
||||||
from spacy.syntax.parser import GreedyParser
|
|
||||||
from spacy.syntax.parser import OracleError
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
|
|
||||||
|
|
||||||
def is_punct_label(label):
    """Return True if `label` is a punctuation dependency label.

    A label counts as punctuation when it is exactly 'P', or when it equals
    'punct' ignoring case.
    """
    if label == 'P':
        return True
    return label.lower() == 'punct'
|
|
||||||
|
|
||||||
|
|
||||||
def read_gold(file_):
    """Read sentences from a CoNLL/MALT-style dependency file.

    Sentences are separated by blank lines and hold one token per line; each
    token line is decoded by `_parse_line`.

    Args:
        file_: An open, readable text file object.

    Returns:
        A list with one tuple per sentence:
        (text, [words], ids, words, tags, heads, labels), where `text` is
        the words joined by single spaces. A head index of -1 in the file is
        replaced by the token's own position within the sentence.
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        # Drop blank lines so that an empty file, or sentences separated by
        # more than one blank line, never hand an empty line to _parse_line
        # (which would raise IndexError).
        lines = [line for line in sent_str.split('\n') if line.strip()]
        if not lines:
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(lines):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                # -1 marks the root; point it at itself.
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        sents.append((text, [words], ids, words, tags, heads, labels))
    return sents
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_line(line):
    """Split one whitespace-delimited token line into its annotation fields.

    Returns:
        (id, word, pos, head_idx, label), taken from columns 0, 1, 3, 6 and 7
        respectively; the id and head index are converted to int.
    """
    columns = line.split()
    return (
        int(columns[0]),
        columns[1],
        columns[3],
        int(columns[6]),
        columns[7],
    )
|
|
||||||
|
|
||||||
|
|
||||||
def iter_data(paragraphs, tokenizer, gold_preproc=False):
    """Yield (tokens, tags, heads, labels) for each sentence of each paragraph.

    Each paragraph is a tuple
    (raw, tokenized, ids, words, tags, heads, labels) where `tokenized` is a
    list of word lists. The flat annotation lists are consumed from the front,
    one sentence's worth at a time. Head values are converted to positions
    within the sentence by `_map_indices_to_tokens`.

    `gold_preproc` is accepted but not used by this function.
    """
    for _raw, tokenized, ids, all_words, tags, heads, labels in paragraphs:
        assert len(all_words) == len(heads)
        for sent_words in tokenized:
            n = len(sent_words)
            sent_ids = ids[:n]
            sent_tags = tags[:n]
            sent_labels = labels[:n]
            sent_heads = _map_indices_to_tokens(sent_ids, heads[:n])
            tokens = tokenizer.tokens_from_list(sent_words)
            yield tokens, sent_tags, sent_heads, sent_labels
            # Advance past the sentence that was just emitted.
            ids = ids[n:]
            tags = tags[n:]
            heads = heads[n:]
            labels = labels[n:]
|
|
||||||
|
|
||||||
|
|
||||||
def _map_indices_to_tokens(ids, heads):
    """Translate head ids into positions within `ids`.

    Args:
        ids: Sequence of (hashable) token ids for one sentence.
        heads: Sequence of head ids, one per token.

    Returns:
        A list parallel to `heads`: the position of the first occurrence of
        each head id in `ids`, or None when the head id is not present.
    """
    # Build the id -> first position table once instead of scanning `ids`
    # twice (`in` + `index`) for every head. setdefault keeps the first
    # occurrence, matching list.index.
    first_position = {}
    for position, id_ in enumerate(ids):
        first_position.setdefault(id_, position)
    return [first_position.get(head) for head in heads]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, dev_loc, model_dir):
    """Score tagging and unlabelled attachment on a CoNLL/MALT-style file.

    Gold tags are assigned to the tokens via `tag_from_strings`, then the
    parser is run and each token's predicted head is compared with the gold
    head. Tokens whose gold head is None are counted as skipped, and tokens
    with a punctuation label are excluded from the attachment score.

    Prints "<loss> <skipped> <loss + skipped + total>" followed by the tag
    accuracy, and resets the module-level `loss` to 0.

    Args:
        Language: Callable returning the pipeline object (called with no
            arguments).
        dev_loc: Path of the UTF-8 encoded evaluation file.
        model_dir: Accepted but not used by this function.

    Returns:
        The fraction of scored tokens with the correct head, or 0.0 when no
        token was scored.
    """
    global loss
    nlp = Language()
    n_corr = 0
    pos_corr = 0
    n_tokens = 0
    total = 0
    skipped = 0
    loss = 0
    with codecs.open(dev_loc, 'r', 'utf8') as file_:
        paragraphs = read_gold(file_)
    for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer):
        assert len(tokens) == len(labels)
        nlp.tagger.tag_from_strings(tokens, tag_strs)
        nlp.parser(tokens)
        for i, token in enumerate(tokens):
            try:
                pos_corr += token.tag_ == tag_strs[i]
            except Exception:
                # Show which token failed before propagating the error.
                print('%s %s %s' % (i, token.orth_, token.tag))
                raise
            n_tokens += 1
            if heads[i] is None:
                skipped += 1
                continue
            if is_punct_label(labels[i]):
                continue
            n_corr += token.head.i == heads[i]
            total += 1
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('%s %s %s' % (loss, skipped, (loss + skipped + total)))
    # Guard the divisions so an empty evaluation file reports 0.0 instead of
    # raising ZeroDivisionError.
    print(pos_corr / n_tokens if n_tokens else 0.0)
    denominator = total + loss
    if not denominator:
        return 0.0
    return float(n_corr) / denominator
|
|
||||||
|
|
||||||
|
|
||||||
def main(dev_loc, model_dir):
    """Evaluate the English pipeline on `dev_loc` and print the score.

    Both arguments are forwarded unchanged to `evaluate`.
    """
    # print(...) with one argument works on both Python 2 and Python 3.
    print(evaluate(English, dev_loc, model_dir))


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,261 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import codecs
|
|
||||||
import random
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
import re
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.gold import read_json_file
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
|
|
||||||
from spacy.syntax.parser import Parser, get_templates
|
|
||||||
from spacy._theano import TheanoModel
|
|
||||||
|
|
||||||
import theano
|
|
||||||
import theano.tensor as T
|
|
||||||
|
|
||||||
from theano.printing import Print
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
from collections import OrderedDict, defaultdict
|
|
||||||
|
|
||||||
|
|
||||||
# Module-wide Theano settings: profiling off, 'float32' as the float type.
theano.config.profile = False
theano.config.floatX = 'float32'
# Local alias for the configured float type name.
floatX = theano.config.floatX
|
|
||||||
|
|
||||||
|
|
||||||
def L1(L1_reg, *weights):
    """Return the L1 penalty: `L1_reg` times the summed absolute weights.

    Each item of `weights` must support `abs(w).sum()`.
    """
    penalty = 0
    for w in weights:
        penalty = penalty + abs(w).sum()
    return L1_reg * penalty
|
|
||||||
|
|
||||||
|
|
||||||
def L2(L2_reg, *weights):
    """Return the L2 penalty: `L2_reg` times the summed squared weights.

    Each item of `weights` must support `(w ** 2).sum()`.
    """
    penalty = 0
    for w in weights:
        penalty = penalty + (w ** 2).sum()
    return L2_reg * penalty
|
|
||||||
|
|
||||||
|
|
||||||
def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    """Build RMSProp-style update rules for `params`.

    For every parameter an accumulator shared variable (zeros, same shape
    and dtype as the parameter's value) tracks a running average of the
    squared gradient; the parameter step is the gradient scaled by
    eta / sqrt(accumulator + eps).

    Args:
        loss: Theano expression to differentiate.
        params: Iterable of Theano shared variables.
        eta: Learning rate.
        rho: Decay rate of the squared-gradient average.
        eps: Constant added inside the square root.

    Returns:
        An OrderedDict mapping each accumulator and each parameter to its
        update expression.
    """
    updates = OrderedDict()
    for param in params:
        value = param.get_value(borrow=True)
        # The file imports `numpy`, not `np`; the original `np.zeros` raised
        # NameError.
        accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)

        grad = T.grad(loss, param)
        accu_new = rho * accu + (1 - rho) * grad ** 2
        updates[accu] = accu_new
        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
    return updates
|
|
||||||
|
|
||||||
|
|
||||||
def relu(x):
    """Rectified linear unit: keep positive entries of `x`, zero the rest.

    Implemented as `x` multiplied by the mask `x > 0`.
    """
    positive_mask = x > 0
    return x * positive_mask
|
|
||||||
|
|
||||||
|
|
||||||
def feed_layer(activation, weights, bias, input_):
    """Apply one dense layer: activation(dot(input_, weights) + bias)."""
    pre_activation = T.dot(input_, weights) + bias
    return activation(pre_activation)
|
|
||||||
|
|
||||||
|
|
||||||
def init_weights(n_in, n_out):
    """Create the weight matrix and bias vector for one layer.

    Weights are drawn from a standard normal scaled by sqrt(2 / n_in) using
    a fixed seed (1235); the bias starts at zero. Both use
    `theano.config.floatX` as dtype.

    Args:
        n_in: Number of inputs to the layer.
        n_out: Number of outputs of the layer.

    Returns:
        [W, b] as Theano shared variables named 'W' and 'b'.
    """
    rng = numpy.random.RandomState(1235)

    weights = numpy.asarray(
        rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
        dtype=theano.config.floatX
    )
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    # The original called an undefined `wrapper`. The values are later passed
    # to rms_prop, which needs get_value()/broadcastable, i.e. shared
    # variables.
    return [theano.shared(weights, name='W'), theano.shared(bias, name='b')]
|
|
||||||
|
|
||||||
|
|
||||||
def compile_model(n_classes, n_hidden, n_in, optimizer):
    """Compile the training and prediction functions of a one-hidden-layer net.

    The network is relu(x . hidden_W + hidden_b) followed by a softmax layer.
    The loss is the negative log of the total probability assigned to the
    classes whose entry in `costs` is 0 (plus 1e-8 inside the log).

    Args:
        n_classes: Size of the softmax output.
        n_hidden: Number of hidden units.
        n_in: Length of the input vector.
        optimizer: Callable (loss, params) -> Theano updates.

    Returns:
        (train_model, evaluate_model):
        train_model(x, costs) returns [class probabilities, d loss / d x,
        loss] and applies the optimizer's updates; evaluate_model(x) returns
        [class probabilities].
    """
    x = T.vector('x')
    costs = T.ivector('costs')

    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
    hidden_W, hidden_b = init_weights(n_in, n_hidden)

    # Feed the inputs forward through the network. The graph is built once
    # and shared by both compiled functions so they cannot drift apart.
    hidden = feed_layer(relu, hidden_W, hidden_b, x)
    # The softmax output is indexed with [0], as in the original code.
    # NOTE(review): presumably softmax returns a one-row matrix for a vector
    # input -- confirm against the Theano version in use.
    p_y_given_x = feed_layer(T.nnet.softmax, maxent_W, maxent_b, hidden)[0]

    loss = -T.log(T.sum(p_y_given_x * T.eq(costs, 0)) + 1e-8)

    train_model = theano.function(
        name='train_model',
        inputs=[x, costs],
        outputs=[p_y_given_x, T.grad(loss, x), loss],
        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
    )

    evaluate_model = theano.function(
        name='evaluate_model',
        inputs=[x],
        outputs=[p_y_given_x]
    )
    return train_model, evaluate_model
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, annot_tuples, verbose=False):
    """Tag and parse one gold sentence and add the result to `scorer`.

    The tokens are built from the word list in `annot_tuples[1]`, so the
    tokenization matches the gold annotation.
    """
    gold_words = annot_tuples[1]
    doc = nlp.tokenizer.tokens_from_list(gold_words)
    nlp.tagger(doc)
    nlp.parser(doc)
    gold_parse = GoldParse(doc, annot_tuples)
    scorer.score(doc, gold_parse, verbose=verbose)
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
          seed=0, n_sents=0, verbose=False):
    """Train the tagger and a Theano-backed parser model on `gold_tuples`.

    The 'deps' and 'pos' sub-directories of `model_dir` are deleted and
    recreated, the parser config is written, and then for `n_iter` passes
    every sentence (except single-token ones) is scored with the current
    model and used for one parser and one tagger update. One line per pass is
    printed and appended to `<model_dir>/job.log`.

    Args:
        Language: Pipeline class; instantiated with `data_dir=model_dir`.
        gold_tuples: List of (raw_text, sentences) training items. It is
            shuffled in place after every pass (a truncated copy is shuffled
            when `n_sents` > 0).
        model_dir: Output directory for models, log and vocabulary strings.
        n_iter: Number of passes over the data.
        feat_set: Accepted but not used by this function.
        eta: Learning rate passed to `rms_prop` and stored in the config.
        mu: Only stored in the config.
        nv_hidden: Number of hidden units.
        nv_word, nv_tag, nv_label: Stored in the config as `vector_lengths`.
        seed: Stored in the config.
        n_sents: If positive, train on the first `n_sents` items only.
        verbose: Accepted but not used by this function.

    Returns:
        The trained pipeline object.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config',
                 seed=seed,
                 templates=tuple(),
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                 vector_lengths=(nv_word, nv_tag, nv_label),
                 hidden_nodes=nv_hidden,
                 eta=eta,
                 mu=mu
                 )

    # Bake-in hyper-parameters. The original also passed `rho` and `eps`,
    # which are not defined anywhere in this file; rms_prop's own defaults
    # are used instead.
    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta)
    nlp = Language(data_dir=model_dir)
    n_classes = nlp.parser.model.n_classes
    # NOTE(review): `n_in`, `input_spec` and `model_loc` are not defined in
    # this file, so the next two statements raise NameError as written. Their
    # intended values cannot be determined from here -- TODO supply them.
    train_fn, predict_fn = compile_model(n_classes, nv_hidden, n_in, optimizer)
    nlp.parser.model = TheanoModel(n_classes, input_spec, train_fn,
                                   predict_fn, model_loc)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Itn.\tP.Loss\tUAS\tTag %\tToken %")
    log_loc = path.join(model_dir, 'job.log')
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                # Score with the current weights before updating on the
                # sentence.
                score_model(scorer, nlp, annot_tuples)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                assert gold.is_projective
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                                 scorer.tags_acc,
                                                 scorer.token_acc)
        print(logline)
        # 'aw' is rejected by Python 3's open(); plain append is what was
        # meant.
        with open(log_loc, 'a') as file_:
            file_.write(logline + '\n')
    nlp.parser.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
    return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, gold_tuples, gold_preproc=True):
    """Tag and parse every gold sentence and return the filled Scorer.

    Tokens are built from each sentence's gold word list
    (`annot_tuples[1]`). `gold_preproc` is accepted but not used by this
    function.
    """
    scorer = Scorer()
    for _raw_text, sentences in gold_tuples:
        for annot_tuples, _brackets in sentences:
            doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(doc)
            nlp.parser(doc)
            scorer.score(doc, GoldParse(doc, annot_tuples))
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),

    nv_word=("Word vector length", "option", "W", int),
    nv_tag=("Tag vector length", "option", "T", int),
    nv_label=("Label vector length", "option", "L", int),
    nv_hidden=("Hidden nodes length", "option", "H", int),
    eta=("Learning rate", "option", "E", float),
    mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
         nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
         eta=0.1, mu=0.9, eval_only=False):
    """Command-line entry point: train on `train_loc`, evaluate on `dev_loc`.

    Training documents are read with `read_json_file`, passing a predicate
    that tests for 'wsj' in the document id. The trained pipeline is then
    scored on every document of `dev_loc` and the tokenization, tagging,
    parsing and NER figures are printed.

    NOTE(review): `eval_only` is accepted but never consulted here, so
    training always runs -- confirm whether the flag should skip it.
    """
    gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))

    nlp = train(English, gold_train, model_dir,
                feat_set='embed',
                eta=eta, mu=mu,
                nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
                n_sents=n_sents, n_iter=n_iter,
                verbose=verbose)

    scorer = evaluate(nlp, list(read_json_file(dev_loc)))

    # Each line is formatted into one string so the output is the same under
    # Python 2 (where `print(a, b)` would print a tuple) and Python 3.
    print('TOK %s' % (100 - scorer.token_acc))
    print('POS %s' % scorer.tags_acc)
    print('UAS %s' % scorer.uas)
    print('LAS %s' % scorer.las)

    print('NER P %s' % scorer.ents_p)
    print('NER R %s' % scorer.ents_r)
    print('NER F %s' % scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)
|
|
Loading…
Reference in New Issue