From 105aaadc07623300dff2ba83fca417e92e6accc0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 23 Oct 2016 23:17:41 +0200
Subject: [PATCH] Make deep_learning_keras example use sentences

---
 examples/deep_learning_keras.py | 81 ++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/examples/deep_learning_keras.py b/examples/deep_learning_keras.py
index 43655c0db..c9ea7ff84 100644
--- a/examples/deep_learning_keras.py
+++ b/examples/deep_learning_keras.py
@@ -16,18 +16,18 @@ import spacy
 
 
 class SentimentAnalyser(object):
     @classmethod
-    def load(cls, path, nlp):
+    def load(cls, path, nlp, max_length=100):
         with (path / 'config.json').open() as file_:
-            model = model_from_json(file_.read())
         with (path / 'model').open('rb') as file_:
             lstm_weights = pickle.load(file_)
         embeddings = get_embeddings(nlp.vocab)
         model.set_weights([embeddings] + lstm_weights)
-        return cls(model)
+        return cls(model, max_length=max_length)
 
-    def __init__(self, model):
+    def __init__(self, model, max_length=100):
         self._model = model
+        self.max_length = max_length
 
     def __call__(self, doc):
         X = get_features([doc], self.max_length)
@@ -36,10 +36,16 @@ class SentimentAnalyser(object):
 
     def pipe(self, docs, batch_size=1000, n_threads=2):
         for minibatch in cytoolz.partition_all(batch_size, docs):
-            Xs = get_features(minibatch, self.max_length)
+            minibatch = list(minibatch)
+            sentences = []
+            for doc in minibatch:
+                sentences.extend(doc.sents)
+            Xs = get_features(sentences, self.max_length)
             ys = self._model.predict(Xs)
-            for i, doc in enumerate(minibatch):
-                doc.user_data['sentiment'] = ys[i]
+            for sent, label in zip(sentences, ys):
+                sent.doc.sentiment += label - 0.5
+            for doc in minibatch:
+                yield doc
 
     def set_sentiment(self, doc, y):
         doc.sentiment = float(y[0])
@@ -48,6 +54,16 @@
         # doc.user_data['my_data'] = y
 
 
+def get_labelled_sentences(docs, doc_labels):
+    labels = []
+    sentences = []
+    for doc, y in zip(docs, doc_labels):
+        for sent in doc.sents:
+            sentences.append(sent)
+            labels.append(y)
+    return sentences, numpy.asarray(labels, dtype='int32')
+
+
 def get_features(docs, max_length):
     docs = list(docs)
     Xs = numpy.zeros((len(docs), max_length), dtype='int32')
@@ -63,12 +79,21 @@ def get_features(docs, max_length):
 
 
 def train(train_texts, train_labels, dev_texts, dev_labels,
-          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5):
-    nlp = spacy.load('en', parser=False, tagger=False, entity=False)
+          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
+          by_sentence=True):
+    print("Loading spaCy")
+    nlp = spacy.load('en', entity=False)
     embeddings = get_embeddings(nlp.vocab)
     model = compile_lstm(embeddings, lstm_shape, lstm_settings)
-    train_X = get_features(nlp.pipe(train_texts), lstm_shape['max_length'])
-    dev_X = get_features(nlp.pipe(dev_texts), lstm_shape['max_length'])
+    print("Parsing texts...")
+    train_docs = list(nlp.pipe(train_texts, batch_size=5000, n_threads=3))
+    dev_docs = list(nlp.pipe(dev_texts, batch_size=5000, n_threads=3))
+    if by_sentence:
+        train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
+        dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
+
+    train_X = get_features(train_docs, lstm_shape['max_length'])
+    dev_X = get_features(dev_docs, lstm_shape['max_length'])
     model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
               nb_epoch=nb_epoch, batch_size=batch_size)
     return model
@@ -86,7 +111,7 @@ def compile_lstm(embeddings, shape, settings):
             mask_zero=True
         )
     )
-    model.add(TimeDistributed(Dense(shape['nr_hidden'] * 2)))
+    model.add(TimeDistributed(Dense(shape['nr_hidden'] * 2, bias=False)))
     model.add(Dropout(settings['dropout']))
     model.add(Bidirectional(LSTM(shape['nr_hidden'])))
     model.add(Dropout(settings['dropout']))
@@ -105,25 +130,23 @@ def get_embeddings(vocab):
     return vectors
 
 
-def demonstrate_runtime(model_dir, texts):
-    '''Demonstrate runtime usage of the custom sentiment model with spaCy.
-
-    Here we return a dictionary mapping entities to the average sentiment of the
-    documents they occurred in.
-    '''
+def evaluate(model_dir, texts, labels, max_length=100):
    def create_pipeline(nlp):
        '''
        This could be a lambda, but named functions are easier to read in Python.
        '''
-        return [nlp.tagger, nlp.entity, SentimentAnalyser.load(model_dir, nlp)]
+        return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
+                                                               max_length=max_length)]
 
-    nlp = spacy.load('en', create_pipeline=create_pipeline)
+    nlp = spacy.load('en')
+    nlp.pipeline = create_pipeline(nlp)
 
-    entity_sentiments = collections.Counter(float)
+    correct = 0
+    i = 0
     for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
-        for ent in doc.ents:
-            entity_sentiments[ent.text] += doc.sentiment
-    return entity_sentiments
+        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
+        i += 1
+    return float(correct) / i
 
 
 def read_data(data_dir, limit=0):
@@ -162,10 +185,12 @@ def main(model_dir, train_dir, dev_dir,
     dev_dir = pathlib.Path(dev_dir)
     if is_runtime:
         dev_texts, dev_labels = read_data(dev_dir)
-        demonstrate_runtime(model_dir, dev_texts)
+        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
+        print(acc)
     else:
+        print("Read data")
         train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir)
+        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
         train_labels = numpy.asarray(train_labels, dtype='int32')
         dev_labels = numpy.asarray(dev_labels, dtype='int32')
         lstm = train(train_texts, train_labels, dev_texts, dev_labels,
@@ -175,7 +200,9 @@
                      nb_epoch=nb_epoch, batch_size=batch_size)
         weights = lstm.get_weights()
         with (model_dir / 'model').open('wb') as file_:
-            pickle.dump(file_, weights[1:])
+            pickle.dump(weights[1:], file_)
+        with (model_dir / 'config.json').open('wb') as file_:
+            file_.write(lstm.to_json())
 
 
 if __name__ == '__main__':
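
Note on the change (not part of the patch itself): the example now trains and predicts per sentence and then folds sentence predictions back into a document score. At training time every sentence inherits its document's label (get_labelled_sentences); at runtime SentimentAnalyser.pipe() adds `label - 0.5` to the owning doc's sentiment for each sentence, and evaluate() counts a document as positive when the accumulated score is >= 0.5. A minimal standalone sketch of that scheme follows; the function names and data below are illustrative only and do not appear in the patch.

def label_sentences(docs_as_sentences, doc_labels):
    # Mirrors get_labelled_sentences(): every sentence inherits its document's label.
    sentences, labels = [], []
    for sents, y in zip(docs_as_sentences, doc_labels):
        for sent in sents:
            sentences.append(sent)
            labels.append(y)
    return sentences, labels


def doc_score(sentence_probs, offset=0.5):
    # Mirrors SentimentAnalyser.pipe(): sum (prediction - 0.5) over a doc's sentences.
    return sum(p - offset for p in sentence_probs)


if __name__ == '__main__':
    docs = [["Great film.", "Loved the ending."], ["Dull and slow."]]
    sentences, labels = label_sentences(docs, [1, 0])
    print(labels)                        # [1, 1, 0]
    print(doc_score([0.9, 0.8]) >= 0.5)  # True  -> counted as a positive document
    print(doc_score([0.2]) >= 0.5)       # False -> counted as a negative document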