diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py
index 6886226bb..20a02937d 100644
--- a/examples/keras_parikh_entailment/__main__.py
+++ b/examples/keras_parikh_entailment/__main__.py
@@ -93,7 +93,7 @@ def read_snli(path):
     nr_hidden=("Number of hidden units", "option", "H", int),
     dropout=("Dropout level", "option", "d", float),
     learn_rate=("Learning rate", "option", "e", float),
-    batch_size=("Batch size for neural network training", "option", "b", float),
+    batch_size=("Batch size for neural network training", "option", "b", int),
     nr_epoch=("Number of training epochs", "option", "i", int),
     tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
     gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py
index 84663cf17..eb573f089 100644
--- a/examples/keras_parikh_entailment/keras_decomposable_attention.py
+++ b/examples/keras_parikh_entailment/keras_decomposable_attention.py
@@ -3,8 +3,10 @@
 import numpy
 
 from keras.layers import InputSpec, Layer, Input, Dense, merge
-from keras.layers import Activation, Dropout, Embedding, TimeDistributed
-from keras.layers import Bidirectional, GRU
+from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
+from keras.layers import Bidirectional, GRU, LSTM
+from keras.layers.noise import GaussianNoise
+from keras.layers.advanced_activations import ELU
 import keras.backend as K
 from keras.models import Sequential, Model, model_from_json
 from keras.regularizers import l2
@@ -20,13 +22,13 @@ def build_model(vectors, shape, settings):
     ids2 = Input(shape=(max_length,), dtype='int32', name='words2')
 
     # Construct operations, which we'll chain together.
-    embed = _StaticEmbedding(vectors, max_length, nr_hidden)
+    embed = _StaticEmbedding(vectors, max_length, nr_hidden, dropout=0.2, nr_tune=5000)
     if settings['gru_encode']:
-        encode = _BiRNNEncoding(max_length, nr_hidden)
-    attend = _Attention(max_length, nr_hidden)
+        encode = _BiRNNEncoding(max_length, nr_hidden, dropout=settings['dropout'])
+    attend = _Attention(max_length, nr_hidden, dropout=settings['dropout'])
     align = _SoftAlignment(max_length, nr_hidden)
-    compare = _Comparison(max_length, nr_hidden)
-    entail = _Entailment(nr_hidden, nr_class)
+    compare = _Comparison(max_length, nr_hidden, dropout=settings['dropout'])
+    entail = _Entailment(nr_hidden, nr_class, dropout=settings['dropout'])
 
     # Declare the model as a computational graph.
     sent1 = embed(ids1)  # Shape: (i, n)
@@ -59,15 +61,26 @@ def build_model(vectors, shape, settings):
 
 
 class _StaticEmbedding(object):
-    def __init__(self, vectors, max_length, nr_out):
+    def __init__(self, vectors, max_length, nr_out, nr_tune=1000, dropout=0.0):
+        self.nr_out = nr_out
+        self.max_length = max_length
         self.embed = Embedding(
             vectors.shape[0],
             vectors.shape[1],
             input_length=max_length,
             weights=[vectors],
             name='embed',
-            trainable=False,
-            dropout=0.0)
+            trainable=False)
+        self.tune = Embedding(
+            nr_tune,
+            nr_out,
+            input_length=max_length,
+            weights=None,
+            name='tune',
+            trainable=True,
+            dropout=dropout)
+        self.mod_ids = Lambda(lambda sent: sent % (nr_tune-1)+1,
+                              output_shape=(self.max_length,))
         self.project = TimeDistributed(
             Dense(
@@ -77,23 +90,37 @@ class _StaticEmbedding(object):
                 name='project'))
 
     def __call__(self, sentence):
-        return self.project(self.embed(sentence))
+        def get_output_shape(shapes):
+            print(shapes)
+            return shapes[0]
+        mod_sent = self.mod_ids(sentence)
+        tuning = self.tune(mod_sent)
+        #tuning = merge([tuning, mod_sent],
+        #    mode=lambda AB: AB[0] * (K.clip(K.cast(AB[1], 'float32'), 0, 1)),
+        #    output_shape=(self.max_length, self.nr_out))
+        pretrained = self.project(self.embed(sentence))
+        vectors = merge([pretrained, tuning], mode='sum')
+        return vectors
 
 
 class _BiRNNEncoding(object):
-    def __init__(self, max_length, nr_out):
+    def __init__(self, max_length, nr_out, dropout=0.0):
         self.model = Sequential()
-        self.model.add(Bidirectional(GRU(int(nr_out/2), return_sequences=True),
-                                     input_shape=(max_length, nr_out)))
+        self.model.add(Bidirectional(LSTM(nr_out, return_sequences=True,
+                                          dropout_W=dropout, dropout_U=dropout),
+                                     input_shape=(max_length, nr_out)))
+        self.model.add(TimeDistributed(Dense(nr_out, activation='relu', init='he_normal')))
+        self.model.add(TimeDistributed(Dropout(0.2)))
 
     def __call__(self, sentence):
         return self.model(sentence)
 
 
 class _Attention(object):
-    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
+    def __init__(self, max_length, nr_hidden, dropout=0.0, L2=0.0, activation='relu'):
         self.max_length = max_length
         self.model = Sequential()
+        self.model.add(Dropout(dropout, input_shape=(nr_hidden,)))
         self.model.add(
             Dense(nr_hidden, name='attend1',
                 init='he_normal', W_regularizer=l2(L2),
@@ -134,18 +161,17 @@ class _SoftAlignment(object):
 
 
 class _Comparison(object):
-    def __init__(self, words, nr_hidden, L2=1e-6, dropout=0.2):
+    def __init__(self, words, nr_hidden, L2=0.0, dropout=0.0):
         self.words = words
         self.model = Sequential()
+        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
         self.model.add(Dense(nr_hidden, name='compare1',
-            init='he_normal', W_regularizer=l2(L2),
-            input_shape=(nr_hidden*2,)))
+            init='he_normal', W_regularizer=l2(L2)))
         self.model.add(Activation('relu'))
         self.model.add(Dropout(dropout))
         self.model.add(Dense(nr_hidden, name='compare2',
                         W_regularizer=l2(L2), init='he_normal'))
         self.model.add(Activation('relu'))
-        self.model.add(Dropout(dropout))
         self.model = TimeDistributed(self.model)
 
     def __call__(self, sent, align, **kwargs):
@@ -156,13 +182,16 @@ class _Comparison(object):
 
 
 class _Entailment(object):
-    def __init__(self, nr_hidden, nr_out, dropout=0.2, L2=1e-4):
+    def __init__(self, nr_hidden, nr_out, dropout=0.0, L2=0.0):
         self.model = Sequential()
+        self.model.add(Dropout(dropout, input_shape=(nr_hidden*2,)))
         self.model.add(Dense(nr_hidden, name='entail1',
-            init='he_normal', W_regularizer=l2(L2),
-            input_shape=(nr_hidden*2,)))
+            init='he_normal', W_regularizer=l2(L2)))
         self.model.add(Activation('relu'))
         self.model.add(Dropout(dropout))
+        self.model.add(Dense(nr_hidden, name='entail2',
+            init='he_normal', W_regularizer=l2(L2)))
+        self.model.add(Activation('relu'))
         self.model.add(Dense(nr_out, name='entail_out', activation='softmax',
                         W_regularizer=l2(L2), init='zero'))
 
diff --git a/examples/keras_parikh_entailment/spacy_hook.py b/examples/keras_parikh_entailment/spacy_hook.py
index 082e39ba9..c5c64f0fd 100644
--- a/examples/keras_parikh_entailment/spacy_hook.py
+++ b/examples/keras_parikh_entailment/spacy_hook.py
@@ -1,5 +1,6 @@
 from keras.models import model_from_json
 import numpy
+import numpy.random
 
 
 class KerasSimilarityShim(object):
@@ -31,16 +32,16 @@
         return scores[0]
 
 
-def get_embeddings(vocab):
-    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
-    vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
+def get_embeddings(vocab, nr_unk=100):
+    nr_vector = max(lex.rank for lex in vocab) + 1
+    vectors = numpy.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')
     for lex in vocab:
         if lex.has_vector:
-            vectors[lex.rank + 1] = lex.vector
+            vectors[lex.rank+1] = lex.vector / lex.vector_norm
     return vectors
 
 
-def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100):
+def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
     Xs = numpy.zeros((len(docs), max_length), dtype='int32')
     for i, doc in enumerate(docs):
         if tree_truncate:
@@ -50,17 +51,22 @@ def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100):
         words = []
         while len(words) <= max_length and queue:
             word = queue.pop(0)
-            if rnn_encode or (word.has_vector and not word.is_punct and not word.is_space):
+            if rnn_encode or (not word.is_punct and not word.is_space):
                 words.append(word)
             if tree_truncate:
                 queue.extend(list(word.lefts))
                 queue.extend(list(word.rights))
 
         words.sort()
         for j, token in enumerate(words):
-            Xs[i, j] = token.rank + 1
+            if token.has_vector:
+                Xs[i, j] = token.rank+1
+            else:
+                Xs[i, j] = (token.shape % (nr_unk-1))+2
             j += 1
             if j >= max_length:
                 break
+        else:
+            Xs[i, len(words)] = 1
 
     return Xs
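For orientation, a minimal numpy sketch of what the reworked _StaticEmbedding.__call__ computes: the frozen pretrained vectors are looked up and projected, a small trainable "tune" table is indexed by the hashed ids from the new Lambda layer, and the two results are summed. The sizes, ids and random weights below are invented for illustration, and the embedding dropout is ignored.

import numpy as np

# Illustrative sizes only; the real model uses the spaCy vocab and nr_tune=5000.
nr_vocab, nr_dim, nr_hidden, nr_tune, max_length = 50, 8, 4, 10, 6

rng = np.random.RandomState(0)
pretrained = rng.normal(size=(nr_vocab, nr_dim))    # frozen 'embed' weights
projection = rng.normal(size=(nr_dim, nr_hidden))   # the TimeDistributed 'project' Dense
tune = np.zeros((nr_tune, nr_hidden))               # trainable 'tune' table, learned during training

ids = np.array([3, 17, 42, 0, 0, 0])                # one padded sentence of word ids

mod_ids = ids % (nr_tune - 1) + 1                   # the Lambda layer: hash ids into the tune table
static = pretrained[ids].dot(projection)            # project(embed(sentence))
tuning = tune[mod_ids]                              # tune(mod_sent)
vectors = static + tuning                           # merge([pretrained, tuning], mode='sum')
print(vectors.shape)                                # (6, 4), i.e. (max_length, nr_hidden)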
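Similarly, the id scheme now shared by get_embeddings and get_word_ids can be sketched without spaCy: row 0 stays padding, in-vocabulary tokens use rank+1 (the row where get_embeddings writes the normalised vector), out-of-vocabulary tokens are hashed into a small set of ids via their shape id, and a sentinel 1 is written after the last word when the sentence is shorter than max_length. The ranks and shape ids below are invented stand-ins for spaCy lexeme attributes.

import numpy as np

nr_unk, max_length = 100, 6
# Invented (rank, has_vector, shape-id) triples standing in for spaCy tokens.
sentence = [
    (12,  True,  4801),   # frequent word with a pretrained vector
    (0,   False, 977),    # out-of-vocabulary word
    (530, True,  3310),
]

Xs = np.zeros(max_length, dtype='int32')            # one row of the matrix get_word_ids builds
for j, (rank, has_vector, shape) in enumerate(sentence):
    if has_vector:
        Xs[j] = rank + 1                            # pretrained rows start at 1; 0 stays padding
    else:
        Xs[j] = (shape % (nr_unk - 1)) + 2          # hash unknown words into a small set of ids
Xs[len(sentence)] = 1                               # the for/else sentinel for short sentences
print(Xs)                                           # [ 13  88 531   1   0   0]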