diff --git a/examples/nli/README.md b/examples/nli/README.md
deleted file mode 100644
index e4d63eb2c..000000000
--- a/examples/nli/README.md
+++ /dev/null
@@ -1,77 +0,0 @@
-# A Decomposable Attention Model for Natural Language Inference
-
-This directory contains an implementation of the entailment prediction model
-described by Parikh et al. (2016). The model is notable for its competitive
-performance with very few parameters.
-
-https://arxiv.org/pdf/1606.01933.pdf
-
-The model is implemented using Keras and spaCy. Keras is used to build and
-train the network, while spaCy is used to load the GloVe vectors, perform the
-feature extraction, and help you apply the model at run-time. The following
-demo code shows how the entailment model can be used at runtime, once the hook
-is installed to customise the `.similarity()` method of spaCy's `Doc` and
-`Span` objects:
-
-    def demo(model_dir):
-        nlp = spacy.load('en', path=model_dir,
-                         create_pipeline=create_similarity_pipeline)
-        doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
-        doc2 = nlp(u'The milkshakes are good. The fries are bad.')
-        print(doc1.similarity(doc2))
-        sent1a, sent1b = doc1.sents
-        print(sent1a.similarity(sent1b))
-        print(sent1a.similarity(doc2))
-        print(sent1b.similarity(doc2))
-
-I'm working on a blog post to explain Parikh et al.'s model in more detail.
-I think it is a very interesting example of the attention mechanism, which
-I didn't understand very well before working through this paper.
-
-# How to run the example
-
-1. Install spaCy and its English models (about 1GB of data):
-
-    pip install spacy
-    python -m spacy.en.download
-
-This will give you spaCy's tokenization, tagging, NER and parsing models,
-as well as the GloVe word vectors.
-
-2. Install Keras:
-
-    pip install keras
-
-3. Get Keras working with your GPU.
-
-You're mostly on your own here. My only advice is, if you're setting up on AWS,
-try using the AMI published by NVidia. With the image, getting everything set
-up wasn't *too* painful.
-
-4. Test the Keras model:
-
-    py.test nli/keras_decomposable_attention.py
-
-This should tell you that two tests passed.
-
-5. Download the Stanford Natural Language Inference data:
-
-http://nlp.stanford.edu/projects/snli/
-
-6. Train the model:
-
-    python nli/ train
-
-Training takes about 300 epochs for full accuracy, and I haven't rerun the full
-experiment since refactoring things to publish this example --- please let me
-know if I've broken something.
-
-You should get to at least 85% accuracy on the development data.
-
-7. Evaluate the model (optional):
-
-    python nli/ evaluate
-
-8. Run the demo (optional):
-
-    python nli/ demo
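The core of Parikh et al.'s model is a soft alignment derived from a pairwise attention matrix between the two sentences. As a rough, single-pair NumPy illustration of what the `_Attention` and `_SoftAlignment` pieces in `keras_decomposable_attention.py` compute (toy dimensions, random vectors, and no "attend" feed-forward network, which the real model applies to each word before the dot product; this is not the author's code):

    import numpy

    i, j, n = 4, 5, 8                    # words in sent1, words in sent2, hidden width
    sent1 = numpy.random.randn(i, n)     # projected word vectors for sentence 1
    sent2 = numpy.random.randn(j, n)     # projected word vectors for sentence 2

    # Pairwise attention scores, shape (i, j): every word of sent1 scored
    # against every word of sent2.
    attention = sent1.dot(sent2.T)

    # Row-wise softmax turns each row into weights over sent2's words...
    e = numpy.exp(attention - attention.max(axis=-1, keepdims=True))
    weights = e / e.sum(axis=-1, keepdims=True)

    # ...and the soft alignment for each word of sent1 is a weighted sum of
    # sent2's vectors, shape (i, n). Transposing the attention matrix gives
    # the alignment in the other direction.
    align1 = weights.dot(sent2)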
diff --git a/examples/nli/__main__.py b/examples/nli/__main__.py
deleted file mode 100644
index 35553af43..000000000
--- a/examples/nli/__main__.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from __future__ import division, unicode_literals, print_function
-import json
-
-import numpy
-import plac
-from pathlib import Path
-
-import spacy
-
-from keras_decomposable_attention import build_model
-from spacy_hook import get_embeddings, get_word_ids
-from spacy_hook import create_similarity_pipeline
-
-
-def train(model_dir, train_loc, dev_loc, shape, settings):
-    print("Loading spaCy")
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
-    print("Compiling network")
-    model = build_model(get_embeddings(nlp.vocab), shape, settings)
-    print("Processing texts...")
-    # Convert the SNLI pairs into padded word-id arrays and one-hot labels.
-    train_texts1, train_texts2, train_labels = zip(*read_snli(train_loc))
-    dev_texts1, dev_texts2, dev_labels = zip(*read_snli(dev_loc))
-    train_X = [get_word_ids(list(nlp.pipe(train_texts1)), max_length=shape[0]),
-               get_word_ids(list(nlp.pipe(train_texts2)), max_length=shape[0])]
-    dev_X = [get_word_ids(list(nlp.pipe(dev_texts1)), max_length=shape[0]),
-             get_word_ids(list(nlp.pipe(dev_texts2)), max_length=shape[0])]
-    train_labels = numpy.eye(shape[2], dtype='int32')[numpy.asarray(train_labels)]
-    dev_labels = numpy.eye(shape[2], dtype='int32')[numpy.asarray(dev_labels)]
-
-    model.fit(
-        train_X,
-        train_labels,
-        validation_data=(dev_X, dev_labels),
-        nb_epoch=settings['nr_epoch'],
-        batch_size=settings['batch_size'])
-    # NB: the trained weights still need to be saved under model_dir in the
-    # format that spacy_hook.KerasSimilarityShim.load expects.
-
-
-def evaluate(model_dir, dev_loc):
-    nlp = spacy.load('en', path=model_dir,
-                     tagger=False, parser=False, entity=False, matcher=False,
-                     create_pipeline=create_similarity_pipeline)
-    n = 0
-    correct = 0
-    for text1, text2, label in read_snli(dev_loc):
-        doc1 = nlp(text1)
-        doc2 = nlp(text2)
-        sim = doc1.similarity(doc2)
-        if bool(sim >= 0.5) == label:
-            correct += 1
-        n += 1
-    return correct, n
-
-
-def demo(model_dir):
-    nlp = spacy.load('en', path=model_dir,
-                     tagger=False, parser=False, entity=False, matcher=False,
-                     create_pipeline=create_similarity_pipeline)
-    doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
-    doc2 = nlp(u'The milkshakes are good. The fries are bad.')
-    print('doc1.similarity(doc2)', doc1.similarity(doc2))
-    sent1a, sent1b = doc1.sents
-    print('sent1a.similarity(sent1b)', sent1a.similarity(sent1b))
-    print('sent1a.similarity(doc2)', sent1a.similarity(doc2))
-    print('sent1b.similarity(doc2)', sent1b.similarity(doc2))
-
-
-LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
-
-
-def read_snli(loc):
-    with open(loc) as file_:
-        for line in file_:
-            eg = json.loads(line)
-            label = eg['gold_label']
-            if label == '-':
-                # Skip examples where the annotators could not agree.
-                continue
-            text1 = eg['sentence1']
-            text2 = eg['sentence2']
-            yield text1, text2, LABELS[label]
-
-
-@plac.annotations(
-    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
-    model_dir=("Path to spaCy model directory", "positional", None, Path),
-    train_loc=("Path to training data", "positional", None, Path),
-    dev_loc=("Path to development data", "positional", None, Path),
-    max_length=("Length to truncate sentences", "option", "L", int),
-    nr_hidden=("Number of hidden units", "option", "H", int),
-    dropout=("Dropout level", "option", "d", float),
-    learn_rate=("Learning rate", "option", "e", float),
-    batch_size=("Batch size for neural network training", "option", "b", int),
-    nr_epoch=("Number of training epochs", "option", "i", int)
-)
-def main(mode, model_dir, train_loc, dev_loc,
-         max_length=100,
-         nr_hidden=100,
-         dropout=0.2,
-         learn_rate=0.001,
-         batch_size=100,
-         nr_epoch=5):
-    shape = (max_length, nr_hidden, 3)
-    settings = {
-        'lr': learn_rate,
-        'dropout': dropout,
-        'batch_size': batch_size,
-        'nr_epoch': nr_epoch
-    }
-    if mode == 'train':
-        train(model_dir, train_loc, dev_loc, shape, settings)
-    elif mode == 'evaluate':
-        evaluate(model_dir, dev_loc)
-    else:
-        demo(model_dir)
-
-
-if __name__ == '__main__':
-    plac.call(main)
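For reference, `plac` maps the annotations above onto positional arguments and short option flags. A hypothetical full invocation (the model directory and SNLI file names are illustrative, not taken from the example itself) might look like:

    python nli/ train /path/to/model snli_1.0_train.jsonl snli_1.0_dev.jsonl -L 100 -b 100 -i 300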
diff --git a/examples/nli/keras_decomposable_attention.py b/examples/nli/keras_decomposable_attention.py
deleted file mode 100644
index 21ecda447..000000000
--- a/examples/nli/keras_decomposable_attention.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# Semantic similarity with decomposable attention (using spaCy and Keras)
-# Practical state-of-the-art text similarity with spaCy and Keras
-import numpy
-
-from keras.layers import InputSpec, Layer, Input, Dense, merge
-from keras.layers import Activation, Dropout, Embedding, TimeDistributed
-import keras.backend as K
-import theano.tensor as T
-from keras.models import Sequential, Model, model_from_json
-from keras.regularizers import l2
-from keras.optimizers import Adam
-from keras.layers.normalization import BatchNormalization
-
-
-def build_model(vectors, shape, settings):
-    '''Compile the model.'''
-    max_length, nr_hidden, nr_class = shape
-    # Declare inputs.
-    ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
-    ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
-
-    # Construct operations, which we'll chain together.
-    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
-    attend = _Attention(max_length, nr_hidden)
-    align = _SoftAlignment(max_length, nr_hidden)
-    compare = _Comparison(max_length, nr_hidden)
-    entail = _Entailment(nr_hidden, nr_class)
-
-    # Declare the model as a computational graph.
-    sent1 = embed(ids1)  # Shape: (i, n)
-    sent2 = embed(ids2)  # Shape: (j, n)
-
-    attention = attend(sent1, sent2)  # Shape: (i, j)
-
-    align1 = align(sent2, attention)
-    align2 = align(sent1, attention, transpose=True)
-
-    feats1 = compare(sent1, align1)
-    feats2 = compare(sent2, align2)
-
-    scores = entail(feats1, feats2)
-
-    # Now that we have the input/output, we can construct the Model object...
-    model = Model(input=[ids1, ids2], output=[scores])
-
-    # ...Compile it...
-    model.compile(
-        optimizer=Adam(lr=settings['lr']),
-        loss='categorical_crossentropy',
-        metrics=['accuracy'])
-    # ...And return it for training.
-    return model
-
-
-class _StaticEmbedding(object):
-    # Frozen GloVe embeddings, followed by a learned projection to nr_out.
-    def __init__(self, vectors, max_length, nr_out):
-        self.embed = Embedding(
-            vectors.shape[0],
-            vectors.shape[1],
-            input_length=max_length,
-            weights=[vectors],
-            name='embed',
-            trainable=False,
-            dropout=0.0)
-
-        self.project = TimeDistributed(
-            Dense(
-                nr_out,
-                activation=None,
-                bias=False,
-                name='project'))
-
-    def __call__(self, sentence):
-        return self.project(self.embed(sentence))
-
-
-class _Attention(object):
-    # Feed-forward 'attend' network applied to each word, followed by a
-    # pairwise dot product giving the (i, j) attention matrix.
-    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
-        self.max_length = max_length
-        self.model = Sequential()
-        self.model.add(
-            Dense(nr_hidden, name='attend1',
-                  init='he_normal', W_regularizer=l2(L2),
-                  input_shape=(nr_hidden,), activation='relu'))
-        self.model.add(Dropout(dropout))
-        self.model.add(Dense(nr_hidden, name='attend2',
-                             init='he_normal', W_regularizer=l2(L2), activation='relu'))
-        self.model = TimeDistributed(self.model)
-
-    def __call__(self, sent1, sent2):
-        def _outer(AB):
-            A, B = AB
-            att_ji = T.batched_dot(B, A.dimshuffle((0, 2, 1)))
-            return att_ji.dimshuffle((0, 2, 1))
-
-        return merge(
-            [self.model(sent1), self.model(sent2)],
-            mode=_outer,
-            output_shape=(self.max_length, self.max_length))
-
-
-class _SoftAlignment(object):
-    # Softmax over the attention rows, then a weighted sum of the other
-    # sentence's word vectors.
-    def __init__(self, max_length, nr_hidden):
-        self.max_length = max_length
-        self.nr_hidden = nr_hidden
-
-    def __call__(self, sentence, attention, transpose=False):
-        def _normalize_attention(att_mat):
-            att, mat = att_mat
-            if transpose:
-                att = att.dimshuffle((0, 2, 1))
-            # 3d softmax
-            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
-            s = K.sum(e, axis=-1, keepdims=True)
-            sm_att = e / s
-            return T.batched_dot(sm_att, mat)
-        return merge([attention, sentence], mode=_normalize_attention,
-                     output_shape=(self.max_length, self.nr_hidden))  # Shape: (i, n)
-
-
-class _Comparison(object):
-    # Compare each word with its soft-aligned counterpart, then sum-pool
-    # over the sentence.
-    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
-        self.words = words
-        self.model = Sequential()
-        self.model.add(Dense(nr_hidden, name='compare1',
-                             init='he_normal', W_regularizer=l2(L2),
-                             input_shape=(nr_hidden*2,)))
-        self.model.add(Activation('relu'))
-        self.model.add(Dropout(dropout))
-        self.model.add(Dense(nr_hidden, name='compare2',
-                             W_regularizer=l2(L2), init='he_normal'))
-        self.model.add(Activation('relu'))
-        self.model.add(Dropout(dropout))
-        self.model = TimeDistributed(self.model)
-
-    def __call__(self, sent, align, **kwargs):
-        result = self.model(merge([sent, align], mode='concat'))  # Shape: (i, n)
-        result = _GlobalSumPooling1D()(result, mask=self.words)
-        return result
-
-
-class _Entailment(object):
-    # Classify the concatenated sentence representations into the output classes.
-    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
-        self.model = Sequential()
-        self.model.add(Dense(nr_hidden, name='entail1',
-                             init='he_normal', W_regularizer=l2(L2),
-                             input_shape=(nr_hidden*2,)))
-        self.model.add(Activation('relu'))
-        self.model.add(Dropout(dropout))
-        self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
-                             W_regularizer=l2(L2), init='zero'))
-
-    def __call__(self, feats1, feats2):
-        features = merge([feats1, feats2], mode='concat')
-        return self.model(features)
-
-
-class _GlobalSumPooling1D(Layer):
-    '''Global sum pooling operation for temporal data.
-
-    # Input shape
-        3D tensor with shape: `(samples, steps, features)`.
-
-    # Output shape
-        2D tensor with shape: `(samples, features)`.
-    '''
-    def __init__(self, **kwargs):
-        super(_GlobalSumPooling1D, self).__init__(**kwargs)
-        self.input_spec = [InputSpec(ndim=3)]
-
-    def get_output_shape_for(self, input_shape):
-        return (input_shape[0], input_shape[2])
-
-    def call(self, x, mask=None):
-        if mask is not None:
-            return K.sum(x * T.clip(mask, 0, 1), axis=1)
-        else:
-            return K.sum(x, axis=1)
-
-
-def test_build_model():
-    vectors = numpy.ndarray((100, 8), dtype='float32')
-    shape = (10, 16, 3)
-    settings = {'lr': 0.001, 'dropout': 0.2}
-    model = build_model(vectors, shape, settings)
-
-
-def test_fit_model():
-    def _generate_X(nr_example, length, nr_vector):
-        X1 = numpy.ndarray((nr_example, length), dtype='int32')
-        X1 *= X1 < nr_vector
-        X1 *= 0 <= X1
-        X2 = numpy.ndarray((nr_example, length), dtype='int32')
-        X2 *= X2 < nr_vector
-        X2 *= 0 <= X2
-        return [X1, X2]
-
-    def _generate_Y(nr_example, nr_class):
-        ys = numpy.zeros((nr_example, nr_class), dtype='int32')
-        for i in range(nr_example):
-            ys[i, i % nr_class] = 1
-        return ys
-
-    vectors = numpy.ndarray((100, 8), dtype='float32')
-    shape = (10, 16, 3)
-    settings = {'lr': 0.001, 'dropout': 0.2}
-    model = build_model(vectors, shape, settings)
-
-    train_X = _generate_X(20, shape[0], vectors.shape[0])
-    train_Y = _generate_Y(20, shape[2])
-    dev_X = _generate_X(15, shape[0], vectors.shape[0])
-    dev_Y = _generate_Y(15, shape[2])
-
-    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
-              batch_size=4)
-
-
-__all__ = ['build_model']
diff --git a/examples/nli/spacy_hook.py b/examples/nli/spacy_hook.py
deleted file mode 100644
index 78e5ab71a..000000000
--- a/examples/nli/spacy_hook.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import json
-import pickle
-
-import numpy
-
-from keras.models import model_from_json
-
-
-class KerasSimilarityShim(object):
-    @classmethod
-    def load(cls, path, nlp, get_features=None):
-        if get_features is None:
-            get_features = get_word_ids
-        with (path / 'config.json').open() as file_:
-            config = json.load(file_)
-        model = model_from_json(config['model'])
-        with (path / 'model').open('rb') as file_:
-            weights = pickle.load(file_)
-        embeddings = get_embeddings(nlp.vocab)
-        model.set_weights([embeddings] + weights)
-        return cls(model, get_features=get_features)
-
-    def __init__(self, model, get_features=None):
-        self.model = model
-        self.get_features = get_features
-
-    def __call__(self, doc):
-        # Install the model as the similarity hook for the Doc and its Spans.
-        doc.user_hooks['similarity'] = self.predict
-        doc.user_span_hooks['similarity'] = self.predict
-
-    def predict(self, doc1, doc2):
-        x1 = self.get_features([doc1])
-        x2 = self.get_features([doc2])
-        scores = self.model.predict([x1, x2])
-        return scores[0]
-
-
-def get_embeddings(vocab):
-    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
-    # Row 0 is reserved for padding/out-of-vocabulary tokens, so keep it zeroed.
-    vectors = numpy.zeros((max_rank+1, vocab.vectors_length), dtype='float32')
-    for lex in vocab:
-        if lex.has_vector:
-            vectors[lex.rank + 1] = lex.vector
-    return vectors
-
-
-def get_word_ids(docs, max_length=100):
-    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
-    for i, doc in enumerate(docs):
-        j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-                if j >= max_length:
-                    break
-    return Xs
-
-
-def create_similarity_pipeline(nlp):
-    return [KerasSimilarityShim.load(
-        nlp.path / 'similarity',
-        nlp,
-        get_features=get_word_ids)]
-
-
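One step the example never shows is how the trained weights end up on disk in the format `KerasSimilarityShim.load` reads back. Below is a minimal sketch of the matching save step. It assumes, as `load` itself does, that the frozen embedding table is the first array returned by `model.get_weights()`, and that the target directory already exists; the helper name `save_model` is my own, not part of the example:

    import json
    import pickle

    def save_model(model, path):
        # Counterpart to KerasSimilarityShim.load: the architecture goes into
        # 'config.json' as JSON, and every weight array except the static
        # embedding table is pickled as a list under 'model'.
        with (path / 'config.json').open('w') as file_:
            json.dump({'model': model.to_json()}, file_)
        weights = model.get_weights()[1:]   # drop the frozen embeddings
        with (path / 'model').open('wb') as file_:
            pickle.dump(weights, file_)

In `train()`, calling something like `save_model(model, model_dir / 'similarity')` would then line up with the `nlp.path / 'similarity'` location that `create_similarity_pipeline` loads from.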