mirror of https://github.com/explosion/spaCy.git
Rename entailment example
This commit is contained in:
parent
1b9c6240a7
commit
0b7af54219

@@ -1,77 +0,0 @@
# A Decomposable Attention Model for Natural Language Inference

This directory contains an implementation of the entailment prediction model described
by Parikh et al. (2016). The model is notable for its competitive performance
with very few parameters.

https://arxiv.org/pdf/1606.01933.pdf

The model is implemented using Keras and spaCy. Keras is used to build and
train the network, while spaCy is used to load the GloVe vectors, perform the
feature extraction, and help you apply the model at run-time. The following
demo code shows how the entailment model can be used at runtime, once the hook is
installed to customise the `.similarity()` method of spaCy's `Doc` and `Span`
objects:

    def demo(model_dir):
        nlp = spacy.load('en', path=model_dir,
            create_pipeline=create_similarity_pipeline)
        doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
        doc2 = nlp(u'The milkshakes are good. The fries are bad.')
        print(doc1.similarity(doc2))
        sent1a, sent1b = doc1.sents
        print(sent1a.similarity(sent1b))
        print(sent1a.similarity(doc2))
        print(sent1b.similarity(doc2))
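
Under the hood, the hook is a small pipeline component that re-routes the
`similarity` user hooks on the `Doc` (and its `Span`s) to the trained Keras
model. Roughly like the sketch below; see `spacy_hook.py` in this example for
the real implementation, and read `get_features` as a stand-in for the word-id
feature extraction:

    class KerasSimilarityShim(object):
        def __init__(self, model, get_features):
            self.model = model
            self.get_features = get_features

        def __call__(self, doc):
            # After this runs, doc.similarity() dispatches to the model.
            doc.user_hooks['similarity'] = self.predict
            doc.user_span_hooks['similarity'] = self.predict

        def predict(self, doc1, doc2):
            x1 = self.get_features(doc1)
            x2 = self.get_features(doc2)
            return self.model.predict([x1, x2])[0]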

I'm working on a blog post to explain Parikh et al.'s model in more detail.
I think it is a very interesting example of the attention mechanism, which
I didn't understand very well before working through this paper.

# How to run the example

1. Install spaCy and its English models (about 1GB of data):

    pip install spacy
    python -m spacy.en.download

This will give you spaCy's tokenization, tagging, NER and parsing models,
as well as the GloVe word vectors.
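
If you want a quick sanity check that the vectors were installed, a one-liner
like the following should print `True` (just a check, not part of the example):

    python -c "import spacy; nlp = spacy.load('en'); print(nlp.vocab[u'fries'].has_vector)"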

2. Install Keras

    pip install keras

3. Get Keras working with your GPU

You're mostly on your own here. My only advice is, if you're setting up on AWS,
try using the AMI published by NVidia. With the image, getting everything set
up wasn't *too* painful.
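
This example uses the Theano backend (the model code imports `theano.tensor`),
so assuming CUDA, Theano and Keras are installed and Keras is configured to use
Theano, one way to point the training run at the GPU is the `THEANO_FLAGS`
environment variable, for example:

    THEANO_FLAGS=device=gpu,floatX=float32 python nli/ train <your_model_dir> <train_directory> <dev_directory>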

4. Test the Keras model:

    py.test nli/keras_decomposable_attention.py

This should tell you that two tests passed.

5. Download the Stanford Natural Language Inference data

    http://nlp.stanford.edu/projects/snli/
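
The corpus is distributed as JSON-lines files, which is the format the loader
in this example expects: each line is one JSON object, and only the
`gold_label`, `sentence1` and `sentence2` fields are used (pairs labelled `-`
are skipped). An illustrative line, with made-up values:

    {"gold_label": "entailment", "sentence1": "Two kids are playing outside.", "sentence2": "Children are outdoors."}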

6. Train the model:

    python nli/ train <your_model_dir> <train_directory> <dev_directory>

Training takes about 300 epochs for full accuracy, and I haven't rerun the full
experiment since refactoring things to publish this example --- please let me
know if I've broken something.

You should get to at least 85% on the development data.

7. Evaluate the model (optional):

    python nli/ evaluate <your_model_dir> <dev_directory>

8. Run the demo (optional):

    python nli/ demo <your_model_dir>

@@ -1,105 +0,0 @@
from __future__ import division, unicode_literals, print_function
import json

import spacy

import plac
from pathlib import Path

from spacy_hook import get_embeddings, get_word_ids
from spacy_hook import create_similarity_pipeline
from keras_decomposable_attention import build_model


def train(model_dir, train_loc, dev_loc, shape, settings):
    print("Loading spaCy")
    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)
    print("Processing texts...")
    # NOTE: train_texts/train_labels and dev_texts/dev_labels are assumed to
    # have been prepared from read_snli(train_loc) / read_snli(dev_loc), and
    # get_features is assumed to turn the parsed docs into the padded word-id
    # arrays the model expects (see get_word_ids in spacy_hook.py).
    train_X = get_features(list(nlp.pipe(train_texts)))
    dev_X = get_features(list(nlp.pipe(dev_texts)))

    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        nb_epoch=settings['nr_epoch'],
        batch_size=settings['batch_size'])


def evaluate(model_dir, dev_loc):
    nlp = spacy.load('en', path=model_dir,
            tagger=False, parser=False, entity=False, matcher=False,
            create_pipeline=create_similarity_pipeline)
    # Pair up the texts and labels from the SNLI dev file. (This wiring is
    # reconstructed from read_snli below; the original referenced undefined
    # dev_texts/dev_labels.)
    dev_data = list(read_snli(dev_loc))
    dev_texts = [(text1, text2) for text1, text2, label in dev_data]
    dev_labels = [label for text1, text2, label in dev_data]
    n = 0
    correct = 0
    for (text1, text2), label in zip(dev_texts, dev_labels):
        doc1 = nlp(text1)
        doc2 = nlp(text2)
        sim = doc1.similarity(doc2)
        if bool(sim >= 0.5) == label:
            correct += 1
        n += 1
    return correct, n


def demo(model_dir):
    nlp = spacy.load('en', path=model_dir,
            tagger=False, parser=False, entity=False, matcher=False,
            create_pipeline=create_similarity_pipeline)
    doc1 = nlp(u'Worst fries ever! Greasy and horrible...')
    doc2 = nlp(u'The milkshakes are good. The fries are bad.')
    print('doc1.similarity(doc2)', doc1.similarity(doc2))
    sent1a, sent1b = doc1.sents
    print('sent1a.similarity(sent1b)', sent1a.similarity(sent1b))
    print('sent1a.similarity(doc2)', sent1a.similarity(doc2))
    print('sent1b.similarity(doc2)', sent1b.similarity(doc2))


LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
def read_snli(loc):
    with open(loc) as file_:
        for line in file_:
            eg = json.loads(line)
            label = eg['gold_label']
            # Pairs where the annotators did not agree are marked '-'; skip them.
            if label == '-':
                continue
            text1 = eg['sentence1']
            text2 = eg['sentence2']
            yield text1, text2, LABELS[label]


@plac.annotations(
    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
    model_dir=("Path to spaCy model directory", "positional", None, Path),
    train_loc=("Path to training data", "positional", None, Path),
    dev_loc=("Path to development data", "positional", None, Path),
    max_length=("Length to truncate sentences", "option", "L", int),
    nr_hidden=("Number of hidden units", "option", "H", int),
    dropout=("Dropout level", "option", "d", float),
    learn_rate=("Learning rate", "option", "e", float),
    batch_size=("Batch size for neural network training", "option", "b", int),
    nr_epoch=("Number of training epochs", "option", "i", int)
)
def main(mode, model_dir, train_loc, dev_loc,
         max_length=100,
         nr_hidden=100,
         dropout=0.2,
         learn_rate=0.001,
         batch_size=100,
         nr_epoch=5):
    shape = (max_length, nr_hidden, 3)
    settings = {
        'lr': learn_rate,
        'dropout': dropout,
        'batch_size': batch_size,
        'nr_epoch': nr_epoch
    }
    if mode == 'train':
        train(model_dir, train_loc, dev_loc, shape, settings)
    elif mode == 'evaluate':
        evaluate(model_dir, dev_loc)
    else:
        demo(model_dir)


if __name__ == '__main__':
    plac.call(main)

@@ -1,217 +0,0 @@
# Semantic similarity with decomposable attention (using spaCy and Keras)
# Practical state-of-the-art text similarity with spaCy and Keras
import numpy

from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Activation, Dropout, Embedding, TimeDistributed
import keras.backend as K
import theano.tensor as T
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization


def build_model(vectors, shape, settings):
    '''Compile the model.'''
    max_length, nr_hidden, nr_class = shape
    # Declare inputs.
    ids1 = Input(shape=(max_length,), dtype='int32', name='words1')
    ids2 = Input(shape=(max_length,), dtype='int32', name='words2')

    # Construct operations, which we'll chain together.
    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
    attend = _Attention(max_length, nr_hidden)
    align = _SoftAlignment(max_length, nr_hidden)
    compare = _Comparison(max_length, nr_hidden)
    entail = _Entailment(nr_hidden, nr_class)

    # Declare the model as a computational graph.
    sent1 = embed(ids1)  # Shape: (i, n)
    sent2 = embed(ids2)  # Shape: (j, n)

    attention = attend(sent1, sent2)  # Shape: (i, j)

    align1 = align(sent2, attention)
    align2 = align(sent1, attention, transpose=True)

    feats1 = compare(sent1, align1)
    feats2 = compare(sent2, align2)

    scores = entail(feats1, feats2)

    # Now that we have the input/output, we can construct the Model object...
    model = Model(input=[ids1, ids2], output=[scores])

    # ...Compile it...
    model.compile(
        optimizer=Adam(lr=settings['lr']),
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    # ...And return it for training.
    return model


class _StaticEmbedding(object):
    # Frozen GloVe embedding followed by a learned projection down to nr_out.
    def __init__(self, vectors, max_length, nr_out):
        self.embed = Embedding(
            vectors.shape[0],
            vectors.shape[1],
            input_length=max_length,
            weights=[vectors],
            name='embed',
            trainable=False,
            dropout=0.0)

        self.project = TimeDistributed(
            Dense(
                nr_out,
                activation=None,
                bias=False,
                name='project'))

    def __call__(self, sentence):
        return self.project(self.embed(sentence))


class _Attention(object):
    # The "attend" feed-forward network F from the paper, applied to each
    # position of both sentences; the raw attention matrix is the batched
    # product of the transformed representations.
    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
        self.max_length = max_length
        self.model = Sequential()
        self.model.add(
            Dense(nr_hidden, name='attend1',
                init='he_normal', W_regularizer=l2(L2),
                input_shape=(nr_hidden,), activation='relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='attend2',
            init='he_normal', W_regularizer=l2(L2), activation='relu'))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent1, sent2):
        def _outer((A, B)):
            # Python 2 tuple-argument syntax; A and B are the transformed
            # sentence matrices. Returns the (i, j) attention scores.
            att_ji = T.batched_dot(B, A.dimshuffle((0, 2, 1)))
            return att_ji.dimshuffle((0, 2, 1))

        return merge(
            [self.model(sent1), self.model(sent2)],
            mode=_outer,
            output_shape=(self.max_length, self.max_length))


class _SoftAlignment(object):
    # Normalise the attention weights with a softmax and use them to compute
    # a weighted average over the other sentence's positions.
    def __init__(self, max_length, nr_hidden):
        self.max_length = max_length
        self.nr_hidden = nr_hidden

    def __call__(self, sentence, attention, transpose=False):
        def _normalize_attention((att, mat)):
            if transpose:
                att = att.dimshuffle((0, 2, 1))
            # 3d softmax
            e = K.exp(att - K.max(att, axis=-1, keepdims=True))
            s = K.sum(e, axis=-1, keepdims=True)
            sm_att = e / s
            return T.batched_dot(sm_att, mat)
        return merge([attention, sentence], mode=_normalize_attention,
                     output_shape=(self.max_length, self.nr_hidden))  # Shape: (i, n)


class _Comparison(object):
    # The "compare" network G: compare each position with its soft-aligned
    # counterpart, then sum-pool over the sentence.
    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
        self.words = words
        self.model = Sequential()
        self.model.add(Dense(nr_hidden, name='compare1',
            init='he_normal', W_regularizer=l2(L2),
            input_shape=(nr_hidden*2,)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_hidden, name='compare2',
            W_regularizer=l2(L2), init='he_normal'))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model = TimeDistributed(self.model)

    def __call__(self, sent, align, **kwargs):
        result = self.model(merge([sent, align], mode='concat'))  # Shape: (i, n)
        result = _GlobalSumPooling1D()(result, mask=self.words)
        return result


class _Entailment(object):
    # The "aggregate" network H: classify the concatenated pooled features
    # into entailment / contradiction / neutral.
    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
        self.model = Sequential()
        self.model.add(Dense(nr_hidden, name='entail1',
            init='he_normal', W_regularizer=l2(L2),
            input_shape=(nr_hidden*2,)))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(dropout))
        self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
            W_regularizer=l2(L2), init='zero'))

    def __call__(self, feats1, feats2):
        features = merge([feats1, feats2], mode='concat')
        return self.model(features)


class _GlobalSumPooling1D(Layer):
    '''Global sum pooling operation for temporal data.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.

    # Output shape
        2D tensor with shape: `(samples, features)`.
    '''
    def __init__(self, **kwargs):
        super(_GlobalSumPooling1D, self).__init__(**kwargs)
        self.input_spec = [InputSpec(ndim=3)]

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[2])

    def call(self, x, mask=None):
        if mask is not None:
            return K.sum(x * T.clip(mask, 0, 1), axis=1)
        else:
            return K.sum(x, axis=1)


def test_build_model():
    vectors = numpy.ndarray((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2}
    model = build_model(vectors, shape, settings)


def test_fit_model():
    def _generate_X(nr_example, length, nr_vector):
        X1 = numpy.ndarray((nr_example, length), dtype='int32')
        X1 *= X1 < nr_vector
        X1 *= 0 <= X1
        X2 = numpy.ndarray((nr_example, length), dtype='int32')
        X2 *= X2 < nr_vector
        X2 *= 0 <= X2
        return [X1, X2]

    def _generate_Y(nr_example, nr_class):
        ys = numpy.zeros((nr_example, nr_class), dtype='int32')
        for i in range(nr_example):
            ys[i, i % nr_class] = 1
        return ys

    vectors = numpy.ndarray((100, 8), dtype='float32')
    shape = (10, 16, 3)
    settings = {'lr': 0.001, 'dropout': 0.2}
    model = build_model(vectors, shape, settings)

    train_X = _generate_X(20, shape[0], vectors.shape[1])
    train_Y = _generate_Y(20, shape[2])
    dev_X = _generate_X(15, shape[0], vectors.shape[1])
    dev_Y = _generate_Y(15, shape[2])

    model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5,
              batch_size=4)


__all__ = ['build_model']

@@ -1,62 +0,0 @@
import json
import pickle

import numpy
from keras.models import model_from_json


class KerasSimilarityShim(object):
    @classmethod
    def load(cls, path, nlp, get_features=None):
        if get_features is None:
            # Default to the word-id extractor defined below (the original
            # referenced an undefined doc2ids helper).
            get_features = get_word_ids
        with (path / 'config.json').open() as file_:
            config = json.load(file_)
        model = model_from_json(config['model'])
        with (path / 'model').open('rb') as file_:
            weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + weights)
        return cls(model, get_features=get_features)

    def __init__(self, model, get_features=None):
        self.model = model
        self.get_features = get_features

    def __call__(self, doc):
        # Install the model as the similarity hook for this Doc and its Spans.
        doc.user_hooks['similarity'] = self.predict
        doc.user_span_hooks['similarity'] = self.predict

    def predict(self, doc1, doc2):
        x1 = self.get_features(doc1)
        x2 = self.get_features(doc2)
        scores = self.model.predict([x1, x2])
        return scores[0]


def get_embeddings(vocab):
    # Build the embedding matrix from the vocab's GloVe vectors; row 0 is
    # left for padding / out-of-vocabulary tokens.
    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank + 1] = lex.vector
    return vectors


def get_word_ids(docs, max_length=100):
    # Convert each Doc to a fixed-width row of word ids (rank+1), padded with
    # zeros, skipping punctuation, whitespace and tokens without a vector.
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        j = 0
        for token in doc:
            if token.has_vector and not token.is_punct and not token.is_space:
                Xs[i, j] = token.rank + 1
                j += 1
                if j >= max_length:
                    break
    return Xs


def create_similarity_pipeline(nlp):
    return [KerasSimilarityShim.load(
        nlp.path / 'similarity',
        nlp,
        get_features=get_word_ids)]