From 1b9c6240a7d1eb66dde750200dde3d2e08e9014e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 1 Nov 2016 01:51:54 +0100
Subject: [PATCH] Rename entailment example

---
 examples/keras_parikh_entailment/README.md   |  77 +++++++
 examples/keras_parikh_entailment/__main__.py | 105 +++++++++
 .../keras_decomposable_attention.py          | 217 ++++++++++++++++++
 .../keras_parikh_entailment/spacy_hook.py    |  62 +++++
 4 files changed, 461 insertions(+)
 create mode 100644 examples/keras_parikh_entailment/README.md
 create mode 100644 examples/keras_parikh_entailment/__main__.py
 create mode 100644 examples/keras_parikh_entailment/keras_decomposable_attention.py
 create mode 100644 examples/keras_parikh_entailment/spacy_hook.py

diff --git a/examples/keras_parikh_entailment/README.md b/examples/keras_parikh_entailment/README.md
new file mode 100644
index 000000000..e4d63eb2c
--- /dev/null
+++ b/examples/keras_parikh_entailment/README.md
@@ -0,0 +1,77 @@
# A Decomposable Attention Model for Natural Language Inference

This directory contains an implementation of the entailment prediction model described
by Parikh et al. (2016). The model is notable for its competitive performance
with very few parameters.

https://arxiv.org/pdf/1606.01933.pdf

The model is implemented using Keras and spaCy. Keras is used to build and
train the network, while spaCy is used to load the GloVe vectors, perform the
feature extraction, and help you apply the model at run-time. The following
demo code shows how the entailment model can be used at runtime, once the hook is
installed to customise the `.similarity()` method of spaCy's `Doc` and `Span`
objects:

    def demo(model_dir):
        nlp = spacy.load('en', path=model_dir,
                         create_pipeline=create_similarity_pipeline)
        doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
        doc2 = nlp(u'The milkshakes are good. The fries are bad.')
        print(doc1.similarity(doc2))
        sent1a, sent1b = doc1.sents
        print(sent1a.similarity(sent1b))
        print(sent1a.similarity(doc2))
        print(sent1b.similarity(doc2))

I'm working on a blog post that explains Parikh et al.'s model in more detail.
I think it is a very interesting example of the attention mechanism, which
I didn't understand very well before working through this paper.

# How to run the example

1. Install spaCy and its English models (about 1GB of data):

       pip install spacy
       python -m spacy.en.download

   This will give you spaCy's tokenization, tagging, NER and parsing models,
   as well as the GloVe word vectors.

2. Install Keras:

       pip install keras

3. Get Keras working with your GPU.

   You're mostly on your own here. My only advice is, if you're setting up on
   AWS, to try the AMI published by NVIDIA. With that image, getting everything
   set up wasn't *too* painful.

4. Test the Keras model:

       py.test keras_parikh_entailment/keras_decomposable_attention.py

   This should tell you that two tests passed.

5. Download the Stanford Natural Language Inference data:

   http://nlp.stanford.edu/projects/snli/

   The corpus is distributed as JSON lines; a sample record is shown after
   these steps.

6. Train the model:

       python keras_parikh_entailment/ train <model_dir> <train_loc> <dev_loc>

   Training takes about 300 epochs for full accuracy, and I haven't rerun the full
   experiment since refactoring things to publish this example --- please let me
   know if I've broken something. You should get to at least 85% on the
   development data.

7. Evaluate the model (optional; the same positional arguments are expected):

       python keras_parikh_entailment/ evaluate <model_dir> <train_loc> <dev_loc>

8. Run the demo (optional):

       python keras_parikh_entailment/ demo <model_dir> <train_loc> <dev_loc>
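For reference, each line of the SNLI files is a JSON record. `read_snli` in
`__main__.py` reads just three of its fields, and skips examples where the
annotators reached no consensus (gold label `-`). The values below are
illustrative only:

    {"gold_label": "contradiction",
     "sentence1": "A man inspects the uniform of a figure.",
     "sentence2": "The man is sleeping.",
     ...}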
diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py
new file mode 100644
index 000000000..35553af43
--- /dev/null
+++ b/examples/keras_parikh_entailment/__main__.py
@@ -0,0 +1,105 @@
from __future__ import division, unicode_literals, print_function
import json

import plac
import numpy
from pathlib import Path
import spacy

from keras_decomposable_attention import build_model
from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline


def train(model_dir, train_loc, dev_loc, shape, settings):
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)
    print("Processing texts...")
    # Convert each text into a fixed-width array of word IDs.
    train_X = [get_word_ids(list(nlp.pipe(train_texts1)), max_length=shape[0]),
               get_word_ids(list(nlp.pipe(train_texts2)), max_length=shape[0])]
    dev_X = [get_word_ids(list(nlp.pipe(dev_texts1)), max_length=shape[0]),
             get_word_ids(list(nlp.pipe(dev_texts2)), max_length=shape[0])]

    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        nb_epoch=settings['nr_epoch'],
        batch_size=settings['batch_size'])


def evaluate(model_dir, dev_loc):
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
    nlp = spacy.load('en', path=model_dir,
            tagger=False, parser=False, entity=False, matcher=False,
            create_pipeline=create_similarity_pipeline)
    total = 0
    correct = 0
    for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
        doc1 = nlp(text1)
        doc2 = nlp(text2)
        # The similarity hook returns the predicted probability of
        # 'entailment' (class 0); label[0] is 1 for gold entailment.
        sim = doc1.similarity(doc2)
        if bool(sim >= 0.5) == bool(label[0]):
            correct += 1
        total += 1
    return correct, total


def demo(model_dir):
    nlp = spacy.load('en', path=model_dir,
            tagger=False, parser=False, entity=False, matcher=False,
            create_pipeline=create_similarity_pipeline)
    doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
    doc2 = nlp(u'The milkshakes are good. The fries are bad.')
    print('doc1.similarity(doc2)', doc1.similarity(doc2))
    sent1a, sent1b = doc1.sents
    print('sent1a.similarity(sent1b)', sent1a.similarity(sent1b))
    print('sent1a.similarity(doc2)', sent1a.similarity(doc2))
    print('sent1b.similarity(doc2)', sent1b.similarity(doc2))


LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
def read_snli(loc):
    texts1 = []
    texts2 = []
    labels = []
    with loc.open() as file_:
        for line in file_:
            eg = json.loads(line)
            label = eg['gold_label']
            if label == '-':
                # No consensus among the annotators; skip the example.
                continue
            texts1.append(eg['sentence1'])
            texts2.append(eg['sentence2'])
            labels.append(LABELS[label])
    # One-hot encode the labels for the softmax output layer.
    label_array = numpy.zeros((len(labels), len(LABELS)), dtype='int32')
    for i, label in enumerate(labels):
        label_array[i, label] = 1
    return texts1, texts2, label_array


@plac.annotations(
    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
    model_dir=("Path to spaCy model directory", "positional", None, Path),
    train_loc=("Path to training data", "positional", None, Path),
    dev_loc=("Path to development data", "positional", None, Path),
    max_length=("Length to truncate sentences", "option", "L", int),
    nr_hidden=("Number of hidden units", "option", "H", int),
    dropout=("Dropout level", "option", "d", float),
    learn_rate=("Learning rate", "option", "e", float),
    batch_size=("Batch size for neural network training", "option", "b", int),
    nr_epoch=("Number of training epochs", "option", "i", int)
)
def main(mode, model_dir, train_loc, dev_loc,
        max_length=100,
        nr_hidden=100,
        dropout=0.2,
        learn_rate=0.001,
        batch_size=100,
        nr_epoch=5):
    shape = (max_length, nr_hidden, 3)
    settings = {
        'lr': learn_rate,
        'dropout': dropout,
        'batch_size': batch_size,
        'nr_epoch': nr_epoch
    }
    if mode == 'train':
        train(model_dir, train_loc, dev_loc, shape, settings)
    elif mode == 'evaluate':
        evaluate(model_dir, dev_loc)
    else:
        demo(model_dir)


if __name__ == '__main__':
    plac.call(main)
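For reference, `read_snli` above returns two aligned lists of texts and a
one-hot label array. A quick sketch of its contract (the file name is the
standard SNLI dev file; substitute wherever you unpacked the corpus):

    texts1, texts2, labels = read_snli(Path('snli_1.0/snli_1.0_dev.jsonl'))
    assert len(texts1) == len(texts2) == labels.shape[0]
    assert labels.shape[1] == 3  # entailment, contradiction, neutral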
diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py
new file mode 100644
index 000000000..21ecda447
--- /dev/null
+++ b/examples/keras_parikh_entailment/keras_decomposable_attention.py
@@ -0,0 +1,217 @@
# Semantic similarity with decomposable attention (using spaCy and Keras)
# Practical state-of-the-art text similarity with spaCy and Keras
import numpy

from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Activation, Dropout, Embedding, TimeDistributed
import keras.backend as K
import theano.tensor as T
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.optimizers import Adam


def build_model(vectors, shape, settings):
    '''Compile the model.'''
    max_length, nr_hidden, nr_class = shape
    # Declare inputs.
    ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
    ids2 = Input(shape=(max_length,), dtype='int32', name='words2')

    # Construct operations, which we'll chain together.
    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
    attend = _Attention(max_length, nr_hidden)
    align = _SoftAlignment(max_length, nr_hidden)
    compare = _Comparison(max_length, nr_hidden)
    entail = _Entailment(nr_hidden, nr_class)

    # Declare the model as a computational graph.
    sent1 = embed(ids1)  # Shape: (i, n)
    sent2 = embed(ids2)  # Shape: (j, n)

    attention = attend(sent1, sent2)  # Shape: (i, j)

    align1 = align(sent2, attention)
    align2 = align(sent1, attention, transpose=True)

    feats1 = compare(sent1, align1)
    feats2 = compare(sent2, align2)

    scores = entail(feats1, feats2)

    # Now that we have the input/output, we can construct the Model object...
    model = Model(input=[ids1, ids2], output=[scores])

    # ...compile it...
    model.compile(
        optimizer=Adam(lr=settings['lr']),
        loss='categorical_crossentropy',
        metrics=['accuracy'])

    # ...and return it for training.
    return model
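def _soft_alignment_sketch():
    # Illustrative sanity check only (never called by the model): the
    # soft-alignment step that _SoftAlignment below performs per batch item,
    # written out in plain numpy with made-up sizes.
    i, j, n = 10, 12, 16             # tokens in sent1, tokens in sent2, hidden width
    att = numpy.random.rand(i, j)    # raw attention scores between token pairs
    sent2 = numpy.random.rand(j, n)  # projected representation of sentence 2
    # Row-wise softmax: each token of sent1 gets a distribution over sent2...
    e = numpy.exp(att - att.max(axis=-1, keepdims=True))
    weights = e / e.sum(axis=-1, keepdims=True)
    # ...which builds a weighted summary of sent2 for every token of sent1.
    align1 = weights.dot(sent2)
    assert align1.shape == (i, n)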
class _StaticEmbedding(object):
    def __init__(self, vectors, max_length, nr_out):
        # Frozen pre-trained embeddings, followed by a learned projection
        # down to the network's hidden width.
        self.embed = Embedding(
            vectors.shape[0],
            vectors.shape[1],
            input_length=max_length,
            weights=[vectors],
            name='embed',
            trainable=False,
            dropout=0.0)

        self.project = TimeDistributed(
            Dense(
                nr_out,
                activation=None,
                bias=False,
                name='project'))

    def __call__(self, sentence):
        return self.project(self.embed(sentence))


class _Attention(object):
    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
        self.max_length = max_length
        self.model = Sequential()
        self.model.add(
            Dense(nr_hidden, name='attend1',
                  init='he_normal', W_regularizer=l2(L2),
                  input_shape=(nr_hidden,), activation=activation))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='attend2',
                             init='he_normal', W_regularizer=l2(L2),
                             activation=activation))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent1, sent2):
        def _outer(AB):
            A, B = AB
            att_ji = T.batched_dot(B, A.dimshuffle((0, 2, 1)))
            return att_ji.dimshuffle((0, 2, 1))

        return merge(
            [self.model(sent1), self.model(sent2)],
            mode=_outer,
            output_shape=(self.max_length, self.max_length))


class _SoftAlignment(object):
    def __init__(self, max_length, nr_hidden):
        self.max_length = max_length
        self.nr_hidden = nr_hidden

    def __call__(self, sentence, attention, transpose=False):
        def _normalize_attention(attmat):
            att, mat = attmat
            if transpose:
                att = att.dimshuffle((0, 2, 1))
            # 3d softmax
            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
            s = K.sum(e, axis=-1, keepdims=True)
            sm_att = e / s
            return T.batched_dot(sm_att, mat)
        return merge([attention, sentence], mode=_normalize_attention,
                     output_shape=(self.max_length, self.nr_hidden))  # Shape: (i, n)


class _Comparison(object):
    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
        self.words = words
        self.model = Sequential()
        self.model.add(Dense(nr_hidden, name='compare1',
                             init='he_normal', W_regularizer=l2(L2),
                             input_shape=(nr_hidden*2,)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='compare2',
                             W_regularizer=l2(L2), init='he_normal'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent, align, **kwargs):
        result = self.model(merge([sent, align], mode='concat'))  # Shape: (i, n)
        result = _GlobalSumPooling1D()(result, mask=self.words)
        return result


class _Entailment(object):
    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
        self.model = Sequential()
        self.model.add(Dense(nr_hidden, name='entail1',
                             init='he_normal', W_regularizer=l2(L2),
                             input_shape=(nr_hidden*2,)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
                             W_regularizer=l2(L2), init='zero'))

    def __call__(self, feats1, feats2):
        features = merge([feats1, feats2], mode='concat')
        return self.model(features)
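def _masked_sum_pooling_sketch():
    # Illustrative only (never called): what _GlobalSumPooling1D below
    # computes, in plain numpy. Summing over the time axis collapses
    # (samples, steps, features) to (samples, features); the mask zeroes
    # out padded positions first.
    x = numpy.random.rand(4, 10, 16)            # (samples, steps, features)
    mask = (numpy.random.rand(4, 10, 1) > 0.3)  # nonzero where a real token sits
    pooled = (x * mask).sum(axis=1)             # padded steps contribute nothing
    assert pooled.shape == (4, 16)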
class _GlobalSumPooling1D(Layer):
    '''Global sum pooling operation for temporal data.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.

    # Output shape
        2D tensor with shape: `(samples, features)`.
    '''
    def __init__(self, **kwargs):
        super(_GlobalSumPooling1D, self).__init__(**kwargs)
        self.input_spec = [InputSpec(ndim=3)]

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[2])

    def call(self, x, mask=None):
        if mask is not None:
            return K.sum(x * T.clip(mask, 0, 1), axis=1)
        else:
            return K.sum(x, axis=1)


def test_build_model():
    vectors = numpy.zeros((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2}
    model = build_model(vectors, shape, settings)


def test_fit_model():
    def _generate_X(nr_example, length, nr_vector):
        # Random word IDs in [0, nr_vector) for each sentence position.
        X1 = numpy.random.randint(0, nr_vector, (nr_example, length)).astype('int32')
        X2 = numpy.random.randint(0, nr_vector, (nr_example, length)).astype('int32')
        return [X1, X2]

    def _generate_Y(nr_example, nr_class):
        ys = numpy.zeros((nr_example, nr_class), dtype='int32')
        for i in range(nr_example):
            ys[i, i % nr_class] = 1
        return ys

    vectors = numpy.zeros((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2}
    model = build_model(vectors, shape, settings)

    # Word IDs must fall inside the vocabulary, i.e. below vectors.shape[0].
    train_X = _generate_X(20, shape[0], vectors.shape[0])
    train_Y = _generate_Y(20, shape[2])
    dev_X = _generate_X(15, shape[0], vectors.shape[0])
    dev_Y = _generate_Y(15, shape[2])

    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
              batch_size=4)


__all__ = ['build_model']
diff --git a/examples/keras_parikh_entailment/spacy_hook.py b/examples/keras_parikh_entailment/spacy_hook.py
new file mode 100644
index 000000000..78e5ab71a
--- /dev/null
+++ b/examples/keras_parikh_entailment/spacy_hook.py
@@ -0,0 +1,62 @@
import json
import pickle

import numpy
from keras.models import model_from_json


class KerasSimilarityShim(object):
    @classmethod
    def load(cls, path, nlp, get_features=None):
        if get_features is None:
            get_features = get_word_ids
        with (path / 'config.json').open() as file_:
            config = json.load(file_)
        model = model_from_json(config['model'])
        with (path / 'model').open('rb') as file_:
            weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + weights)
        return cls(model, get_features=get_features)

    def __init__(self, model, get_features=None):
        self.model = model
        self.get_features = get_features

    def __call__(self, doc):
        doc.user_hooks['similarity'] = self.predict
        doc.user_span_hooks['similarity'] = self.predict

    def predict(self, doc1, doc2):
        # get_word_ids expects a batch of docs, so wrap each document.
        x1 = self.get_features([doc1])
        x2 = self.get_features([doc2])
        scores = self.model.predict([x1, x2])
        # Use the probability of 'entailment' (class 0) as the similarity.
        return float(scores[0, 0])


def get_embeddings(vocab):
    max_rank = max(lex.rank + 1 for lex in vocab if lex.has_vector)
    # Row 0 is reserved for out-of-vocabulary tokens; keep it at zero.
    vectors = numpy.zeros((max_rank + 1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank + 1] = lex.vector
    return vectors


def get_word_ids(docs, max_length=100):
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            # Skip punctuation and whitespace; only count stored tokens.
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank + 1
                j += 1
                if j >= max_length:
                    break
    return Xs


def create_similarity_pipeline(nlp):
    return [KerasSimilarityShim.load(
        nlp.path / 'similarity',
        nlp,
        get_features=get_word_ids)]
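A note on the serialization format `KerasSimilarityShim.load` expects:
`config.json` must hold the Keras architecture under a `'model'` key, and
`model` must be a pickle of the weight arrays *without* the frozen embedding
table, since `load` re-reads the embeddings from spaCy's vocab and prepends
them via `set_weights`. A minimal sketch of a matching save step — the helper
name is hypothetical, not part of the example, and it assumes the embedding
table is the model's first weight array:

    import json
    import pickle

    def save_similarity_model(model, path):
        # Write the architecture as JSON, as model_from_json() will re-read it.
        with (path / 'config.json').open('w') as file_:
            file_.write(json.dumps({'model': model.to_json()}))
        # Pickle the weights, minus the embedding table at index 0.
        with (path / 'model').open('wb') as file_:
            pickle.dump(model.get_weights()[1:], file_)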