Fix bugs in deep_learning_keras example.

Matthew Honnibal 2016-10-20 02:49:14 +02:00
parent 3f545f50b5
commit 4c27958990
1 changed file with 52 additions and 51 deletions


@ -1,5 +1,13 @@
import plac
import collections
import random
import cytoolz
import numpy
from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.layers.wrappers import Bidirectional
import cPickle as pickle
import spacy
@ -7,7 +15,14 @@ import spacy
class SentimentAnalyser(object):
    @classmethod
    def load(cls, path, nlp):
        with (path / 'config.json').open() as file_:
            model = model_from_json(file_.read())
        with (path / 'model').open('rb') as file_:
            lstm_weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + lstm_weights)
        return cls(model)
    def __init__(self, model, max_length=100):
        self._model = model
        # pipe() below featurises docs with self.max_length, so it has to be
        # stored here; it should match the input_length the model was built with.
        self.max_length = max_length
@ -18,24 +33,40 @@ class SentimentAnalyser(object):
        self.set_sentiment(doc, y)

    def pipe(self, docs, batch_size=1000, n_threads=2):
        for minibatch in cytoolz.partition_all(batch_size, docs):
            Xs = get_features(minibatch, self.max_length)
            ys = self._model.predict(Xs)
            for i, doc in enumerate(minibatch):
                doc.user_data['sentiment'] = ys[i]

    def set_sentiment(self, doc, y):
        doc.sentiment = float(y[0])
        # Sentiment has a native slot for a single float.
        # For arbitrary data storage, there's:
        # doc.user_data['my_data'] = y

def get_features(docs, max_length):
    docs = list(docs)    # may be a generator (e.g. nlp.pipe), and we need len()
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        for j, token in enumerate(doc[:max_length]):
            Xs[i, j] = token.rank if token.has_vector else 0
    return Xs
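
# Illustrative usage sketch (not part of this commit; the helper name and texts
# are made up). get_features() turns parsed docs into a fixed-width matrix of
# vocabulary ranks, which the Embedding layer maps back to vectors at runtime.
def _demo_get_features():
    nlp = spacy.load('en')
    docs = list(nlp.pipe([u'This was excellent.', u'Dreadful. Avoid.']))
    # Shape (2, 8): one row per doc, one column per token position, zero-padded.
    return get_features(docs, max_length=8)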

def train(train_texts, train_labels, dev_texts, dev_labels,
          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5):
    nlp = spacy.load('en', parser=False, tagger=False, entity=False)
    embeddings = get_embeddings(nlp.vocab)
    model = compile_lstm(embeddings, lstm_shape, lstm_settings)
    train_X = get_features(nlp.pipe(train_texts), lstm_shape['max_length'])
    dev_X = get_features(nlp.pipe(dev_texts), lstm_shape['max_length'])
    model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
              nb_epoch=nb_epoch, batch_size=batch_size)
    return model

def compile_lstm(embeddings, shape, settings):
    model = Sequential()
    model.add(
        Embedding(
@ -53,42 +84,14 @@ def compile_lstm(embeddings, shape, settings, optimizer):
def get_embeddings(vocab):
    '''Get a numpy vector of the word embeddings. The Lexeme.rank attribute will
    be the index into the table. We're going to be "decadent" here and use
    1m vectors, because we're not going to fine-tune them.
    '''
    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank] = lex.vector
    return vectors
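
# Illustrative sketch only: the body of compile_lstm() is cut off by the hunk
# above, and the optimizer, loss and exact layer stack below are assumptions
# rather than the committed code. It shows how the imported layers and the
# shape/settings dicts plausibly fit together with get_embeddings().
def _example_compile_lstm(embeddings, shape, settings):
    model = Sequential()
    model.add(
        Embedding(
            embeddings.shape[0],              # rows: max rank + 1
            embeddings.shape[1],              # columns: vector width
            input_length=shape['max_length'],
            trainable=False,                  # keep the pre-trained vectors fixed
            weights=[embeddings]))
    model.add(Bidirectional(LSTM(shape['nr_hidden'])))
    model.add(Dropout(settings['dropout']))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
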
def demonstrate_runtime(model_dir, texts):
    '''Demonstrate runtime usage of the custom sentiment model with spaCy.
@ -102,16 +105,11 @@ def demonstrate_runtime(model_dir, texts):
        return [nlp.tagger, nlp.entity, SentimentAnalyser.load(model_dir, nlp)]

    nlp = spacy.load('en', create_pipeline=create_pipeline)
    entity_sentiments = collections.Counter()
    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
        for ent in doc.ents:
            entity_sentiments[ent.text] += doc.sentiment
    return entity_sentiments
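
# Illustrative usage (not part of this commit; the function name is made up):
# the Counter returned above can be ranked directly with most_common().
def _print_top_entities(model_dir, texts, n=10):
    for entity, total in demonstrate_runtime(model_dir, texts).most_common(n):
        print('%s %.3f' % (entity, total))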
@ -120,7 +118,7 @@ def read_data(data_dir, limit=0):
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (data_dir / subdir).iterdir():
            with filename.open() as file_:
                text = file_.read()
            examples.append((text, label))
    random.shuffle(examples)
    if limit >= 1:
@ -147,16 +145,19 @@ def main(model_dir, train_dir, dev_dir,
         dropout=0.5, # General NN config
         nb_epoch=5, batch_size=100, nr_examples=-1): # Training params
    if is_runtime:
        dev_texts, dev_labels = read_data(dev_dir)
        demonstrate_runtime(model_dir, dev_texts)
    else:
        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
        dev_texts, dev_labels = read_data(dev_dir)
        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
                     {'nr_hidden': nr_hidden, 'max_length': max_length},
                     {'dropout': dropout},
                     {},
                     nb_epoch=nb_epoch, batch_size=batch_size)
        weights = lstm.get_weights()
        with (model_dir / 'model').open('wb') as file_:
            pickle.dump(weights[1:], file_)
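        # Sketch (an assumption, not part of this diff): SentimentAnalyser.load()
        # above also reads (path / 'config.json'), so the architecture would need
        # saving alongside the weights, e.g. with Keras' model.to_json():
        with (model_dir / 'config.json').open('w') as file_:
            file_.write(lstm.to_json())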

if __name__ == '__main__':