spaCy/examples/keras_parikh_entailment/__main__.py

208 lines
6.3 KiB
Python

import numpy as np
import json
from keras.utils import to_categorical
import plac
import sys
from keras_decomposable_attention import build_model
from spacy_hook import get_embeddings, KerasSimilarityShim
try:
import cPickle as pickle
except ImportError:
import pickle
import spacy
# workaround for keras/tensorflow bug
# see https://github.com/tensorflow/tensorflow/issues/3388
import os
import importlib
from keras import backend as K
def set_keras_backend(backend):
if K.backend() != backend:
os.environ["KERAS_BACKEND"] = backend
importlib.reload(K)
assert K.backend() == backend
if backend == "tensorflow":
K.get_session().close()
cfg = K.tf.ConfigProto()
cfg.gpu_options.allow_growth = True
K.set_session(K.tf.Session(config=cfg))
K.clear_session()
set_keras_backend("tensorflow")
def train(train_loc, dev_loc, shape, settings):
train_texts1, train_texts2, train_labels = read_snli(train_loc)
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
print("Loading spaCy")
nlp = spacy.load("en_vectors_web_lg")
assert nlp.path is not None
print("Processing texts...")
train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])
print("Compiling network")
model = build_model(get_embeddings(nlp.vocab), shape, settings)
print(settings)
model.fit(
train_X,
train_labels,
validation_data=(dev_X, dev_labels),
epochs=settings["nr_epoch"],
batch_size=settings["batch_size"],
)
if not (nlp.path / "similarity").exists():
(nlp.path / "similarity").mkdir()
print("Saving to", nlp.path / "similarity")
weights = model.get_weights()
# remove the embedding matrix. We can reconstruct it.
del weights[1]
with (nlp.path / "similarity" / "model").open("wb") as file_:
pickle.dump(weights, file_)
with (nlp.path / "similarity" / "config.json").open("w") as file_:
file_.write(model.to_json())
def evaluate(dev_loc, shape):
dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0]))
total = 0.0
correct = 0.0
for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
doc1 = nlp(text1)
doc2 = nlp(text2)
sim, _ = doc1.similarity(doc2)
if sim == KerasSimilarityShim.entailment_types[label.argmax()]:
correct += 1
total += 1
return correct, total
def demo(shape):
nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0]))
doc1 = nlp(u"The king of France is bald.")
doc2 = nlp(u"France has no king.")
print("Sentence 1:", doc1)
print("Sentence 2:", doc2)
entailment_type, confidence = doc1.similarity(doc2)
print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
LABELS = {"entailment": 0, "contradiction": 1, "neutral": 2}
def read_snli(path):
texts1 = []
texts2 = []
labels = []
with open(path, "r") as file_:
for line in file_:
eg = json.loads(line)
label = eg["gold_label"]
if label == "-": # per Parikh, ignore - SNLI entries
continue
texts1.append(eg["sentence1"])
texts2.append(eg["sentence2"])
labels.append(LABELS[label])
return texts1, texts2, to_categorical(np.asarray(labels, dtype="int32"))
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
sents = texts + hypotheses
sents_as_ids = []
for sent in sents:
doc = nlp(sent)
word_ids = []
for i, token in enumerate(doc):
# skip odd spaces from tokenizer
if token.has_vector and token.vector_norm == 0:
continue
if i > max_length:
break
if token.has_vector:
word_ids.append(token.rank + num_unk + 1)
else:
# if we don't have a vector, pick an OOV entry
word_ids.append(token.rank % num_unk + 1)
# there must be a simpler way of generating padded arrays from lists...
word_id_vec = np.zeros((max_length), dtype="int")
clipped_len = min(max_length, len(word_ids))
word_id_vec[:clipped_len] = word_ids[:clipped_len]
sents_as_ids.append(word_id_vec)
return [np.array(sents_as_ids[: len(texts)]), np.array(sents_as_ids[len(texts) :])]
@plac.annotations(
mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
train_loc=("Path to training data", "option", "t", str),
dev_loc=("Path to development or test data", "option", "s", str),
max_length=("Length to truncate sentences", "option", "L", int),
nr_hidden=("Number of hidden units", "option", "H", int),
dropout=("Dropout level", "option", "d", float),
learn_rate=("Learning rate", "option", "r", float),
batch_size=("Batch size for neural network training", "option", "b", int),
nr_epoch=("Number of training epochs", "option", "e", int),
entail_dir=(
"Direction of entailment",
"option",
"D",
str,
["both", "left", "right"],
),
)
def main(
mode,
train_loc,
dev_loc,
max_length=50,
nr_hidden=200,
dropout=0.2,
learn_rate=0.001,
batch_size=1024,
nr_epoch=10,
entail_dir="both",
):
shape = (max_length, nr_hidden, 3)
settings = {
"lr": learn_rate,
"dropout": dropout,
"batch_size": batch_size,
"nr_epoch": nr_epoch,
"entail_dir": entail_dir,
}
if mode == "train":
if train_loc == None or dev_loc == None:
print("Train mode requires paths to training and development data sets.")
sys.exit(1)
train(train_loc, dev_loc, shape, settings)
elif mode == "evaluate":
if dev_loc == None:
print("Evaluate mode requires paths to test data set.")
sys.exit(1)
correct, total = evaluate(dev_loc, shape)
print(correct, "/", total, correct / total)
else:
demo(shape)
if __name__ == "__main__":
plac.call(main)