Rename entailment example

2016-11-01 01:51:54 +01:00 · 2016-11-01 01:51:54 +01:00 · 1b9c6240a7
parent 58f7be93ee
commit 1b9c6240a7
4 changed files with 461 additions and 0 deletions
--- a/examples/keras_parikh_entailment/README.md
+++ b/examples/keras_parikh_entailment/README.md
@ -0,0 +1,77 @@
+# A Decomposable Attention Model for Natural Language Inference
+
+This directory contains an implementation of the entailment prediction model
+described by Parikh et al. (2016). The model is notable for its competitive
+performance with very few parameters.
+
+https://arxiv.org/pdf/1606.01933.pdf
+
+The model is implemented using Keras and spaCy. Keras is used to build and
+train the network, while spaCy is used to load the GloVe vectors, perform the
+feature extraction, and help you apply the model at run-time. The following
+demo code shows how the entailment model can be used at runtime, once the hook is
+installed to customise the `.similarity()` method of spaCy's `Doc` and `Span`
+objects:
+
+    def demo(model_dir):
+        nlp = spacy.load('en', path=model_dir,
+                create_pipeline=create_similarity_pipeline)
+        doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
+        doc2 = nlp(u'The milkshakes are good. The fries are bad.')
+        print(doc1.similarity(doc2))
+        sent1a, sent1b = doc1.sents
+        print(sent1a.similarity(sent1b))
+        print(sent1a.similarity(doc2))
+        print(sent1b.similarity(doc2))
+
+I'm working on a blog post to explain Parikh et al.'s model in more detail.
+I think it is a very interesting example of the attention mechanism, which
+I didn't understand very well before working through this paper.
+
+# How to run the example
+
+1. Install spaCy and its English models (about 1GB of data):
+
+    pip install spacy
+    python -m spacy.en.download
+
+This will give you spaCy's tokenization, tagging, NER and parsing models,
+as well as the GloVe word vectors.
+
+2. Install Keras
+
+    pip install keras
+
+3. Get Keras working with your GPU
+
+You're mostly on your own here. My only advice is, if you're setting up on AWS,
+try using the AMI published by NVidia. With the image, getting everything set
+up wasn't *too* painful. 
+
+4. Test the Keras model:
+
+    py.test nli/keras_decomposable_attention.py
+
+This should tell you that two tests passed.
+
+5. Download the Stanford Natural Language Inference data
+
+http://nlp.stanford.edu/projects/snli/
+
+6. Train the model:
+
+    python nli/ train <your_model_dir> <train_directory> <dev_directory>
+
+Training takes about 300 epochs for full accuracy, and I haven't rerun the full
+experiment since refactoring things to publish this example --- please let me
+know if I've broken something.
+
+You should get to at least 85% on the development data.
+
+7. Evaluate the model (optional):
+
+    python nli/ evaluate <your_model_dir> <dev_directory>
+
+8. Run the demo (optional):
+
+    python nli/ demo <your_model_dir>
--- a/examples/keras_parikh_entailment/main.py
+++ b/examples/keras_parikh_entailment/main.py
@ -0,0 +1,105 @@
+from __future__ import division, unicode_literals, print_function
+import spacy
+
+import plac
+from pathlib import Path
+
+from spacy_hook import get_embeddings, get_word_ids
+from spacy_hook import create_similarity_pipeline
+
+
+def train(model_dir, train_loc, dev_loc, shape, settings):
+    print("Loading spaCy")
+    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
+    print("Compiling network")
+    model = build_model(get_embeddings(nlp.vocab), shape, settings)
+    print("Processing texts...")
+    train_X = get_features(list(nlp.pipe(train_texts)))
+    dev_X = get_features(list(nlp.pipe(dev_texts)))
+
+    model.fit(
+        train_X,
+        train_labels,
+        validation_data=(dev_X, dev_labels),
+        nb_epoch=settings['nr_epoch'],
+        batch_size=settings['batch_size'])
+
+
+def evaluate(model_dir, dev_loc):
+    nlp = spacy.load('en', path=model_dir,
+            tagger=False, parser=False, entity=False, matcher=False,
+            create_pipeline=create_similarity_pipeline)
+    n = 0
+    correct = 0
+    for (text1, text2), label in zip(dev_texts, dev_labels):
+        doc1 = nlp(text1)
+        doc2 = nlp(text2)
+        sim = doc1.similarity(doc2)
+        if bool(sim >= 0.5) == label:
+            correct += 1
+        n += 1
+    return correct, total
+
+
+def demo(model_dir):
+    nlp = spacy.load('en', path=model_dir,
+            tagger=False, parser=False, entity=False, matcher=False,
+            create_pipeline=create_similarity_pipeline)
+    doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
+    doc2 = nlp(u'The milkshakes are good. The fries are bad.')
+    print('doc1.similarity(doc2)', doc1.similarity(doc2))
+    sent1a, sent1b = doc1.sents
+    print('sent1a.similarity(sent1b)', sent1a.similarity(sent1b))
+    print('sent1a.similarity(doc2)', sent1a.similarity(doc2))
+    print('sent1b.similarity(doc2)', sent1b.similarity(doc2))
+
+
+LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
+def read_snli(loc):
+    with open(loc) as file_:
+        for line in file_:
+            eg = json.loads(line)
+            label = eg['gold_label']
+            if label == '-':
+                continue
+            text1 = eg['sentence1']
+            text2 = eg['sentence2']
+            yield text1, text2, LABELS[label]
+
+
+@plac.annotations(
+    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
+    model_dir=("Path to spaCy model directory", "positional", None, Path),
+    train_loc=("Path to training data", "positional", None, Path),
+    dev_loc=("Path to development data", "positional", None, Path),
+    max_length=("Length to truncate sentences", "option", "L", int),
+    nr_hidden=("Number of hidden units", "option", "H", int),
+    dropout=("Dropout level", "option", "d", float),
+    learn_rate=("Learning rate", "option", "e", float),
+    batch_size=("Batch size for neural network training", "option", "b", float),
+    nr_epoch=("Number of training epochs", "option", "i", float)
+)
+def main(mode, model_dir, train_loc, dev_loc,
+        max_length=100,
+        nr_hidden=100,
+        dropout=0.2,
+        learn_rate=0.001,
+        batch_size=100,
+        nr_epoch=5):
+    shape = (max_length, nr_hidden, 3)
+    settings = {
+        'lr': learn_rate,
+        'dropout': dropout,
+        'batch_size': batch_size,
+        'nr_epoch': nr_epoch
+    }
+    if mode == 'train':
+        train(model_dir, train_loc, dev_loc, shape, settings)
+    elif mode == 'evaluate':
+        evaluate(model_dir, dev_loc)
+    else:
+        demo(model_dir)
+
+
+# Script entry point: hand main() to plac, which calls it with arguments
+# taken from the command line.
+if __name__ == '__main__':
+    plac.call(main)
--- a/examples/keras_parikh_entailment/keras_decomposable_attention.py
+++ b/examples/keras_parikh_entailment/keras_decomposable_attention.py
@ -0,0 +1,217 @@
+# Semantic similarity with decomposable attention (using spaCy and Keras)
+# Practical state-of-the-art text similarity with spaCy and Keras
+import numpy
+
+from keras.layers import InputSpec, Layer, Input, Dense, merge
+from keras.layers import Activation, Dropout, Embedding, TimeDistributed
+import keras.backend as K
+import theano.tensor as T
+from keras.models import Sequential, Model, model_from_json
+from keras.regularizers import l2
+from keras.optimizers import Adam
+from keras.layers.normalization import BatchNormalization
+
+
+def build_model(vectors, shape, settings):
+    '''Compile the model.'''
+    max_length, nr_hidden, nr_class = shape
+    # Declare inputs.
+    ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
+    ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
+
+    # Construct operations, which we'll chain together.
+    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
+    attend = _Attention(max_length, nr_hidden)
+    align = _SoftAlignment(max_length, nr_hidden)
+    compare = _Comparison(max_length, nr_hidden)
+    entail = _Entailment(nr_hidden, nr_class)
+    
+    # Declare the model as a computational graph.
+    sent1 = embed(ids1) # Shape: (i, n)
+    sent2 = embed(ids2) # Shape: (j, n)
+
+    attention = attend(sent1, sent2)  # Shape: (i, j)
+
+    align1 = align(sent2, attention)
+    align2 = align(sent1, attention, transpose=True)
+    
+    feats1 = compare(sent1, align1)
+    feats2 = compare(sent2, align2)
+    
+    scores = entail(feats1, feats2)
+    
+    # Now that we have the input/output, we can construct the Model object...
+    model = Model(input=[ids1, ids2], output=[scores])
+
+    # ...Compile it...
+    model.compile(
+        optimizer=Adam(lr=settings['lr']),
+        loss='categorical_crossentropy',
+        metrics=['accuracy'])
+    # ...And return it for training.
+    return model
+
+
+class _StaticEmbedding(object):
+    def __init__(self, vectors, max_length, nr_out):
+        self.embed = Embedding(
+                        vectors.shape[0],
+                        vectors.shape[1],
+                        input_length=max_length,
+                        weights=[vectors],
+                        name='embed',
+                        trainable=False,
+                        dropout=0.0)
+
+        self.project = TimeDistributed(
+                            Dense(
+                                nr_out,
+                                activation=None,
+                                bias=False,
+                                name='project'))
+
+    def __call__(self, sentence):
+        return self.project(self.embed(sentence))
+
+
+class _Attention(object):
+    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
+        self.max_length = max_length
+        self.model = Sequential()
+        self.model.add(
+            Dense(nr_hidden, name='attend1',
+                init='he_normal', W_regularizer=l2(L2),
+                input_shape=(nr_hidden,), activation='relu'))
+        self.model.add(Dropout(dropout))
+        self.model.add(Dense(nr_hidden, name='attend2',
+            init='he_normal', W_regularizer=l2(L2), activation='relu'))
+        self.model = TimeDistributed(self.model)
+    
+    def __call__(self, sent1, sent2):
+        def _outer((A, B)):
+            att_ji = T.batched_dot(B, A.dimshuffle((0, 2, 1)))
+            return att_ji.dimshuffle((0, 2, 1))
+
+        return merge(
+                [self.model(sent1), self.model(sent2)],
+                mode=_outer,
+                output_shape=(self.max_length, self.max_length))
+
+
+class _SoftAlignment(object):
+    def __init__(self, max_length, nr_hidden):
+        self.max_length = max_length
+        self.nr_hidden = nr_hidden
+
+    def __call__(self, sentence, attention, transpose=False):
+        def _normalize_attention((att, mat)):
+            if transpose:
+                att = att.dimshuffle((0, 2, 1))
+            # 3d softmax
+            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
+            s = K.sum(e, axis=-1, keepdims=True)
+            sm_att = e / s
+            return T.batched_dot(sm_att, mat)
+        return merge([attention, sentence], mode=_normalize_attention,
+                      output_shape=(self.max_length, self.nr_hidden)) # Shape: (i, n)
+ 
+
+class _Comparison(object):
+    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
+        self.words = words
+        self.model = Sequential()
+        self.model.add(Dense(nr_hidden, name='compare1',
+            init='he_normal', W_regularizer=l2(L2),
+            input_shape=(nr_hidden*2,)))
+        self.model.add(Activation('relu'))
+        self.model.add(Dropout(dropout))
+        self.model.add(Dense(nr_hidden, name='compare2',
+                        W_regularizer=l2(L2), init='he_normal'))
+        self.model.add(Activation('relu'))
+        self.model.add(Dropout(dropout))
+        self.model = TimeDistributed(self.model)
+
+    def __call__(self, sent, align, **kwargs):
+        result = self.model(merge([sent, align], mode='concat')) # Shape: (i, n)
+        result = _GlobalSumPooling1D()(result, mask=self.words)
+        return result
+ 
+
+class _Entailment(object):
+    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
+        self.model = Sequential()
+        self.model.add(Dense(nr_hidden, name='entail1',
+            init='he_normal', W_regularizer=l2(L2),
+            input_shape=(nr_hidden*2,)))
+        self.model.add(Activation('relu'))
+        self.model.add(Dropout(dropout))
+        self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
+                        W_regularizer=l2(L2), init='zero'))
+
+    def __call__(self, feats1, feats2):
+        features = merge([feats1, feats2], mode='concat')
+        return self.model(features)
+
+
+class _GlobalSumPooling1D(Layer):
+    '''Global sum pooling operation for temporal data.
+
+    # Input shape
+        3D tensor with shape: `(samples, steps, features)`.
+
+    # Output shape
+        2D tensor with shape: `(samples, features)`.
+    '''
+    def __init__(self, **kwargs):
+        super(_GlobalSumPooling1D, self).__init__(**kwargs)
+        self.input_spec = [InputSpec(ndim=3)]
+
+    def get_output_shape_for(self, input_shape):
+        return (input_shape[0], input_shape[2])
+
+    def call(self, x, mask=None):
+        if mask is not None:
+            return K.sum(x * T.clip(mask, 0, 1), axis=1)
+        else:
+            return K.sum(x, axis=1)
+
+
+def test_build_model():
+    vectors = numpy.ndarray((100, 8), dtype='float32')
+    shape = (10, 16, 3)
+    settings = {'lr': 0.001, 'dropout': 0.2}
+    model = build_model(vectors, shape, settings)
+
+
+def test_fit_model():
+    def _generate_X(nr_example, length, nr_vector):
+        X1 = numpy.ndarray((nr_example, length), dtype='int32')
+        X1 *= X1 < nr_vector
+        X1 *= 0 <= X1
+        X2 = numpy.ndarray((nr_example, length), dtype='int32')
+        X2 *= X2 < nr_vector
+        X2 *= 0 <= X2
+        return [X1, X2]
+    def _generate_Y(nr_example, nr_class):
+        ys = numpy.zeros((nr_example, nr_class), dtype='int32')
+        for i in range(nr_example):
+            ys[i, i % nr_class] = 1
+        return ys
+
+    vectors = numpy.ndarray((100, 8), dtype='float32')
+    shape = (10, 16, 3)
+    settings = {'lr': 0.001, 'dropout': 0.2}
+    model = build_model(vectors, shape, settings)
+    
+    train_X = _generate_X(20, shape[0], vectors.shape[1])
+    train_Y = _generate_Y(20, shape[2])
+    dev_X = _generate_X(15, shape[0], vectors.shape[1])
+    dev_Y = _generate_Y(15, shape[2])
+
+    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
+              batch_size=4)
+
+
+
+
+__all__ = [build_model]
--- a/examples/keras_parikh_entailment/spacy_hook.py
+++ b/examples/keras_parikh_entailment/spacy_hook.py
@ -0,0 +1,62 @@
+import json
+import pickle
+
+import numpy
+from keras.models import model_from_json
+
+
+class KerasSimilarityShim(object):
+    @classmethod
+    def load(cls, path, nlp, get_features=None):
+        if get_features is None:
+            get_features = doc2ids
+        with (path / 'config.json').open() as file_:
+            config = json.load(file_)
+        model = model_from_json(config['model'])
+        with (path / 'model').open('rb') as file_:
+            weights = pickle.load(file_)
+        embeddings = get_embeddings(nlp.vocab)
+        model.set_weights([embeddings] + weights)
+        return cls(model, get_features=get_features)
+
+    def __init__(self, model, get_features=None):
+        self.model = model
+        self.get_features = get_features
+
+    def __call__(self, doc):
+        doc.user_hooks['similarity'] = self.predict
+        doc.user_span_hooks['similarity'] = self.predict
+    
+    def predict(self, doc1, doc2):
+        x1 = self.get_features(doc1)
+        x2 = self.get_features(doc2)
+        scores = self.model.predict([x1, x2])
+        return scores[0]
+
+
+def get_embeddings(cls, vocab):
+    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
+    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
+    for lex in vocab:
+        if lex.has_vector:
+            vectors[lex.rank + 1] = lex.vector
+    return vectors
+
+
+def get_word_ids(docs, max_length=100):
+    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
+    for i, doc in enumerate(docs):
+        j = 0
+        for token in doc:
+            if token.has_vector and not token.is_punct and not token.is_space:
+                Xs[i, j] = token.rank + 1
+                j += 1
+                if j >= max_length:
+                    break
+    return Xs
+
+
+def create_similarity_pipeline(nlp):
+    return [SimilarityModel.load(
+                nlp.path / 'similarity',
+                nlp,
+                feature_extracter=get_features)]
+
+
+