mirror of https://github.com/explosion/spaCy.git
Rename entailment example
This commit is contained in:
parent
1b9c6240a7
commit
0b7af54219

@@ -1,77 +0,0 @@
# A Decomposable Attention Model for Natural Language Inference

This directory contains an implementation of the entailment prediction model described
by Parikh et al. (2016). The model is notable for its competitive performance
with very few parameters.

https://arxiv.org/pdf/1606.01933.pdf

The model is implemented using Keras and spaCy. Keras is used to build and
train the network, while spaCy is used to load the GloVe vectors, perform the
feature extraction, and help you apply the model at run-time. The following
demo code shows how the entailment model can be used at runtime, once the hook is
installed to customise the `.similarity()` method of spaCy's `Doc` and `Span`
objects:

    def demo(model_dir):
        nlp = spacy.load('en', path=model_dir,
            create_pipeline=create_similarity_pipeline)
        doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
        doc2 = nlp(u'The milkshakes are good. The fries are bad.')
        print(doc1.similarity(doc2))
        sent1a, sent1b = doc1.sents
        print(sent1a.similarity(sent1b))
        print(sent1a.similarity(doc2))
        print(sent1b.similarity(doc2))
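
Under the hood, the hook is a small pipeline component that re-routes the
`similarity` user hooks on the `Doc` (and its `Span`s) to the trained Keras
model. Roughly like the sketch below; see `spacy_hook.py` in this example for
the real implementation, and read `get_features` as a stand-in for the word-id
feature extraction:

    class KerasSimilarityShim(object):
        def __init__(self, model, get_features):
            self.model = model
            self.get_features = get_features

        def __call__(self, doc):
            # After this runs, doc.similarity() dispatches to the model.
            doc.user_hooks['similarity'] = self.predict
            doc.user_span_hooks['similarity'] = self.predict

        def predict(self, doc1, doc2):
            x1 = self.get_features(doc1)
            x2 = self.get_features(doc2)
            return self.model.predict([x1, x2])[0]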

I'm working on a blog post to explain Parikh et al.'s model in more detail.
I think it is a very interesting example of the attention mechanism, which
I didn't understand very well before working through this paper.

# How to run the example

1. Install spaCy and its English models (about 1GB of data):

    pip install spacy
    python -m spacy.en.download

This will give you spaCy's tokenization, tagging, NER and parsing models,
as well as the GloVe word vectors.
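
If you want a quick sanity check that the vectors were installed, a one-liner
like the following should print `True` (just a check, not part of the example):

    python -c "import spacy; nlp = spacy.load('en'); print(nlp.vocab[u'fries'].has_vector)"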

2. Install Keras

    pip install keras

3. Get Keras working with your GPU

You're mostly on your own here. My only advice is, if you're setting up on AWS,
try using the AMI published by NVidia. With the image, getting everything set
up wasn't *too* painful.
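
This example uses the Theano backend (the model code imports `theano.tensor`),
so assuming CUDA, Theano and Keras are installed and Keras is configured to use
Theano, one way to point the training run at the GPU is the `THEANO_FLAGS`
environment variable, for example:

    THEANO_FLAGS=device=gpu,floatX=float32 python nli/ train <your_model_dir> <train_directory> <dev_directory>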

4. Test the Keras model:

    py.test nli/keras_decomposable_attention.py

This should tell you that two tests passed.

5. Download the Stanford Natural Language Inference data

    http://nlp.stanford.edu/projects/snli/
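
The corpus is distributed as JSON-lines files, which is the format the loader
in this example expects: each line is one JSON object, and only the
`gold_label`, `sentence1` and `sentence2` fields are used (pairs labelled `-`
are skipped). An illustrative line, with made-up values:

    {"gold_label": "entailment", "sentence1": "Two kids are playing outside.", "sentence2": "Children are outdoors."}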

6. Train the model:

    python nli/ train <your_model_dir> <train_directory> <dev_directory>

Training takes about 300 epochs for full accuracy, and I haven't rerun the full
experiment since refactoring things to publish this example --- please let me
know if I've broken something.

You should get to at least 85% on the development data.

7. Evaluate the model (optional):

    python nli/ evaluate <your_model_dir> <dev_directory>

8. Run the demo (optional):

    python nli/ demo <your_model_dir>

@@ -1,105 +0,0 @@
from __future__ import division, unicode_literals, print_function
import json

import spacy

import plac
from pathlib import Path

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline
from keras_decomposable_attention import build_model


def train(model_dir, train_loc, dev_loc, shape, settings):
    print("Loading spaCy")
    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)
    print("Processing texts...")
    # NOTE: train_texts/train_labels and dev_texts/dev_labels are assumed to
    # have been prepared from read_snli(train_loc) / read_snli(dev_loc), and
    # get_features is assumed to turn the parsed docs into the padded word-id
    # arrays the model expects (see get_word_ids in spacy_hook.py).
    train_X = get_features(list(nlp.pipe(train_texts)))
    dev_X = get_features(list(nlp.pipe(dev_texts)))

    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        nb_epoch=settings['nr_epoch'],
        batch_size=settings['batch_size'])


def evaluate(model_dir, dev_loc):
    nlp = spacy.load('en', path=model_dir,
            tagger=False, parser=False, entity=False, matcher=False,
            create_pipeline=create_similarity_pipeline)
    # Pair up the texts and labels from the SNLI dev file. (This wiring is
    # reconstructed from read_snli below; the original referenced undefined
    # dev_texts/dev_labels.)
    dev_data = list(read_snli(dev_loc))
    dev_texts = [(text1, text2) for text1, text2, label in dev_data]
    dev_labels = [label for text1, text2, label in dev_data]
    n = 0
    correct = 0
    for (text1, text2), label in zip(dev_texts, dev_labels):
        doc1 = nlp(text1)
        doc2 = nlp(text2)
        sim = doc1.similarity(doc2)
        if bool(sim >= 0.5) == label:
            correct += 1
        n += 1
    return correct, n


def demo(model_dir):
    nlp = spacy.load('en', path=model_dir,
            tagger=False, parser=False, entity=False, matcher=False,
            create_pipeline=create_similarity_pipeline)
    doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
    doc2 = nlp(u'The milkshakes are good. The fries are bad.')
    print('doc1.similarity(doc2)', doc1.similarity(doc2))
    sent1a, sent1b = doc1.sents
    print('sent1a.similarity(sent1b)', sent1a.similarity(sent1b))
    print('sent1a.similarity(doc2)', sent1a.similarity(doc2))
    print('sent1b.similarity(doc2)', sent1b.similarity(doc2))


LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
def read_snli(loc):
    with open(loc) as file_:
        for line in file_:
            eg = json.loads(line)
            label = eg['gold_label']
            # Pairs where the annotators did not agree are marked '-'; skip them.
            if label == '-':
                continue
            text1 = eg['sentence1']
            text2 = eg['sentence2']
            yield text1, text2, LABELS[label]


@plac.annotations(
    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
    model_dir=("Path to spaCy model directory", "positional", None, Path),
    train_loc=("Path to training data", "positional", None, Path),
    dev_loc=("Path to development data", "positional", None, Path),
    max_length=("Length to truncate sentences", "option", "L", int),
    nr_hidden=("Number of hidden units", "option", "H", int),
    dropout=("Dropout level", "option", "d", float),
    learn_rate=("Learning rate", "option", "e", float),
    batch_size=("Batch size for neural network training", "option", "b", int),
    nr_epoch=("Number of training epochs", "option", "i", int)
)
def main(mode, model_dir, train_loc, dev_loc,
         max_length=100,
         nr_hidden=100,
         dropout=0.2,
         learn_rate=0.001,
         batch_size=100,
         nr_epoch=5):
    shape = (max_length, nr_hidden, 3)
    settings = {
        'lr': learn_rate,
        'dropout': dropout,
        'batch_size': batch_size,
        'nr_epoch': nr_epoch
    }
    if mode == 'train':
        train(model_dir, train_loc, dev_loc, shape, settings)
    elif mode == 'evaluate':
        evaluate(model_dir, dev_loc)
    else:
        demo(model_dir)


if __name__ == '__main__':
    plac.call(main)

@@ -1,217 +0,0 @@
# Semantic similarity with decomposable attention (using spaCy and Keras)
# Practical state-of-the-art text similarity with spaCy and Keras
import numpy

from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Activation, Dropout, Embedding, TimeDistributed
import keras.backend as K
import theano.tensor as T
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization


def build_model(vectors, shape, settings):
    '''Compile the model.'''
    max_length, nr_hidden, nr_class = shape
    # Declare inputs.
    ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
    ids2 = Input(shape=(max_length,), dtype='int32', name='words2')

    # Construct operations, which we'll chain together.
    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
    attend = _Attention(max_length, nr_hidden)
    align = _SoftAlignment(max_length, nr_hidden)
    compare = _Comparison(max_length, nr_hidden)
    entail = _Entailment(nr_hidden, nr_class)

    # Declare the model as a computational graph.
    sent1 = embed(ids1)  # Shape: (i, n)
    sent2 = embed(ids2)  # Shape: (j, n)

    attention = attend(sent1, sent2)  # Shape: (i, j)

    align1 = align(sent2, attention)
    align2 = align(sent1, attention, transpose=True)

    feats1 = compare(sent1, align1)
    feats2 = compare(sent2, align2)

    scores = entail(feats1, feats2)

    # Now that we have the input/output, we can construct the Model object...
    model = Model(input=[ids1, ids2], output=[scores])

    # ...Compile it...
    model.compile(
        optimizer=Adam(lr=settings['lr']),
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    # ...And return it for training.
    return model


class _StaticEmbedding(object):
    # Frozen GloVe embedding followed by a learned projection down to nr_out.
    def __init__(self, vectors, max_length, nr_out):
        self.embed = Embedding(
            vectors.shape[0],
            vectors.shape[1],
            input_length=max_length,
            weights=[vectors],
            name='embed',
            trainable=False,
            dropout=0.0)

        self.project = TimeDistributed(
            Dense(
                nr_out,
                activation=None,
                bias=False,
                name='project'))

    def __call__(self, sentence):
        return self.project(self.embed(sentence))


class _Attention(object):
    # The "attend" feed-forward network F from the paper, applied to each
    # position of both sentences; the raw attention matrix is the batched
    # product of the transformed representations.
    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
        self.max_length = max_length
        self.model = Sequential()
        self.model.add(
            Dense(nr_hidden, name='attend1',
                init='he_normal', W_regularizer=l2(L2),
                input_shape=(nr_hidden,), activation='relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='attend2',
            init='he_normal', W_regularizer=l2(L2), activation='relu'))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent1, sent2):
        def _outer((A, B)):
            # Python 2 tuple-argument syntax; A and B are the transformed
            # sentence matrices. Returns the (i, j) attention scores.
            att_ji = T.batched_dot(B, A.dimshuffle((0, 2, 1)))
            return att_ji.dimshuffle((0, 2, 1))

        return merge(
            [self.model(sent1), self.model(sent2)],
            mode=_outer,
            output_shape=(self.max_length, self.max_length))


class _SoftAlignment(object):
    # Normalise the attention weights with a softmax and use them to compute
    # a weighted average over the other sentence's positions.
    def __init__(self, max_length, nr_hidden):
        self.max_length = max_length
        self.nr_hidden = nr_hidden

    def __call__(self, sentence, attention, transpose=False):
        def _normalize_attention((att, mat)):
            if transpose:
                att = att.dimshuffle((0, 2, 1))
            # 3d softmax
            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
            s = K.sum(e, axis=-1, keepdims=True)
            sm_att = e / s
            return T.batched_dot(sm_att, mat)
        return merge([attention, sentence], mode=_normalize_attention,
                     output_shape=(self.max_length, self.nr_hidden))  # Shape: (i, n)


class _Comparison(object):
    # The "compare" network G: compare each position with its soft-aligned
    # counterpart, then sum-pool over the sentence.
    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
        self.words = words
        self.model = Sequential()
        self.model.add(Dense(nr_hidden, name='compare1',
            init='he_normal', W_regularizer=l2(L2),
            input_shape=(nr_hidden*2,)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='compare2',
            W_regularizer=l2(L2), init='he_normal'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent, align, **kwargs):
        result = self.model(merge([sent, align], mode='concat'))  # Shape: (i, n)
        result = _GlobalSumPooling1D()(result, mask=self.words)
        return result


class _Entailment(object):
    # The "aggregate" network H: classify the concatenated pooled features
    # into entailment / contradiction / neutral.
    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
        self.model = Sequential()
        self.model.add(Dense(nr_hidden, name='entail1',
            init='he_normal', W_regularizer=l2(L2),
            input_shape=(nr_hidden*2,)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
            W_regularizer=l2(L2), init='zero'))

    def __call__(self, feats1, feats2):
        features = merge([feats1, feats2], mode='concat')
        return self.model(features)


class _GlobalSumPooling1D(Layer):
    '''Global sum pooling operation for temporal data.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.

    # Output shape
        2D tensor with shape: `(samples, features)`.
    '''
    def __init__(self, **kwargs):
        super(_GlobalSumPooling1D, self).__init__(**kwargs)
        self.input_spec = [InputSpec(ndim=3)]

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[2])

    def call(self, x, mask=None):
        if mask is not None:
            return K.sum(x * T.clip(mask, 0, 1), axis=1)
        else:
            return K.sum(x, axis=1)


def test_build_model():
    vectors = numpy.ndarray((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2}
    model = build_model(vectors, shape, settings)


def test_fit_model():
    def _generate_X(nr_example, length, nr_vector):
        X1 = numpy.ndarray((nr_example, length), dtype='int32')
        X1 *= X1 < nr_vector
        X1 *= 0 <= X1
        X2 = numpy.ndarray((nr_example, length), dtype='int32')
        X2 *= X2 < nr_vector
        X2 *= 0 <= X2
        return [X1, X2]

    def _generate_Y(nr_example, nr_class):
        ys = numpy.zeros((nr_example, nr_class), dtype='int32')
        for i in range(nr_example):
            ys[i, i % nr_class] = 1
        return ys

    vectors = numpy.ndarray((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2}
    model = build_model(vectors, shape, settings)

    train_X = _generate_X(20, shape[0], vectors.shape[1])
    train_Y = _generate_Y(20, shape[2])
    dev_X = _generate_X(15, shape[0], vectors.shape[1])
    dev_Y = _generate_Y(15, shape[2])

    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
              batch_size=4)


__all__ = ['build_model']

@@ -1,62 +0,0 @@
import json
import pickle

import numpy
from keras.models import model_from_json


class KerasSimilarityShim(object):
    @classmethod
    def load(cls, path, nlp, get_features=None):
        if get_features is None:
            # Default to the word-id extractor defined below (the original
            # referenced an undefined doc2ids helper).
            get_features = get_word_ids
        with (path / 'config.json').open() as file_:
            config = json.load(file_)
        model = model_from_json(config['model'])
        with (path / 'model').open('rb') as file_:
            weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + weights)
        return cls(model, get_features=get_features)

    def __init__(self, model, get_features=None):
        self.model = model
        self.get_features = get_features

    def __call__(self, doc):
        # Install the model as the similarity hook for this Doc and its Spans.
        doc.user_hooks['similarity'] = self.predict
        doc.user_span_hooks['similarity'] = self.predict

    def predict(self, doc1, doc2):
        x1 = self.get_features(doc1)
        x2 = self.get_features(doc2)
        scores = self.model.predict([x1, x2])
        return scores[0]


def get_embeddings(vocab):
    # Build the embedding matrix from the vocab's GloVe vectors; row 0 is
    # left for padding / out-of-vocabulary tokens.
    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank + 1] = lex.vector
    return vectors


def get_word_ids(docs, max_length=100):
    # Convert each Doc to a fixed-width row of word ids (rank+1), padded with
    # zeros, skipping punctuation, whitespace and tokens without a vector.
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank + 1
                j += 1
                if j >= max_length:
                    break
    return Xs


def create_similarity_pipeline(nlp):
    return [KerasSimilarityShim.load(
        nlp.path / 'similarity',
        nlp,
        get_features=get_word_ids)]