# spaCy/examples/keras_parikh_entailment/spacy_hook.py

from keras.models import model_from_json
import numpy
import numpy.random
import json
from spacy.tokens.span import Span

try:
    import cPickle as pickle
except ImportError:
    import pickle


class KerasSimilarityShim(object):
    """Pipeline component that installs a Keras model as the similarity hook.

    Calling an instance on a doc stores ``predict`` under the ``'similarity'``
    key of both ``doc.user_hooks`` and ``doc.user_span_hooks``.
    """

    @classmethod
    def load(cls, path, nlp, get_features=None, max_length=100):
        """Build a shim from a saved model directory.

        The architecture is read from ``path / 'config.json'`` and the pickled
        weight list from ``path / 'model'``. The embedding table is rebuilt
        from ``nlp.vocab`` and prepended to the loaded weights before they are
        set on the model.

        path: directory object supporting ``/`` and ``.open()``.
        nlp: pipeline object; only ``nlp.vocab`` is read here.
        get_features: feature extractor; defaults to ``get_word_ids``.
        max_length: forwarded to the feature extractor at prediction time.
        """
        if get_features is None:
            get_features = get_word_ids
        with (path / 'config.json').open() as file_:
            model = model_from_json(file_.read())
        # NOTE: unpickling can execute arbitrary code. Only load model
        # directories that come from a trusted source.
        with (path / 'model').open('rb') as file_:
            weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        model.set_weights([embeddings] + weights)
        return cls(model, get_features=get_features, max_length=max_length)

    def __init__(self, model, get_features=None, max_length=100):
        """Store the model and the feature extractor used by ``predict``.

        model: object exposing ``predict(inputs)``.
        get_features: callable invoked as
            ``get_features([doc], max_length=..., tree_truncate=True)``;
            defaults to ``get_word_ids``.
        max_length: forwarded to ``get_features`` on every prediction.
        """
        # Mirror load(): fall back to the default extractor instead of storing
        # None, which would only fail later inside predict().
        if get_features is None:
            get_features = get_word_ids
        self.model = model
        self.get_features = get_features
        self.max_length = max_length

    def __call__(self, doc):
        """Register ``predict`` as the similarity hook for ``doc`` and its spans."""
        doc.user_hooks['similarity'] = self.predict
        doc.user_span_hooks['similarity'] = self.predict

    def predict(self, doc1, doc2):
        """Return the model's first prediction for the pair ``(doc1, doc2)``.

        Each argument is featurized on its own as a batch of one, so the model
        receives two single-row inputs.
        """
        x1 = self.get_features([doc1], max_length=self.max_length, tree_truncate=True)
        x2 = self.get_features([doc2], max_length=self.max_length, tree_truncate=True)
        scores = self.model.predict([x1, x2])
        return scores[0]


def get_embeddings(vocab, nr_unk=100):
    """Build the embedding table from the vectors stored in ``vocab``.

    The table has ``max_rank + 1 + nr_unk + 2`` rows and
    ``vocab.vectors_length`` columns, dtype float32. A lexeme that has a
    vector is written to row ``rank + 1`` after being divided by its
    ``vector_norm``; every other row stays zero.
    """
    highest_rank = max(lexeme.rank for lexeme in vocab)
    nr_rows = (highest_rank + 1) + nr_unk + 2
    table = numpy.zeros((nr_rows, vocab.vectors_length), dtype='float32')
    for lexeme in vocab:
        if not lexeme.has_vector:
            continue
        # Row 0 is never assigned here, hence the +1 shift on the rank.
        table[lexeme.rank + 1] = lexeme.vector / lexeme.vector_norm
    return table


def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):
    """Convert docs into a fixed-width matrix of integer word IDs.

    Returns an int32 array of shape ``(len(docs), max_length)``. Row ``i``
    holds the IDs of the tokens selected from ``docs[i]``, in sorted token
    order. When fewer than ``max_length`` tokens are selected, the ID ``1`` is
    written right after the last one; all remaining cells stay ``0``.

    docs: sequence of token containers.
    rnn_encode: if true, punctuation and whitespace tokens are kept as well.
    tree_truncate: if true, tokens are visited breadth-first from the root(s)
        (``doc.root`` for a Span, every sentence root otherwise) through each
        token's ``lefts`` and ``rights``; otherwise in iteration order.
    max_length: number of columns, i.e. the most IDs kept per doc.
    nr_unk: tokens without a vector get ``token.shape % (nr_unk - 1) + 2``;
        tokens with a vector get ``token.rank + 1``.
    """
    # Function-scope import so this block does not depend on a file-level change.
    from collections import deque

    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    if max_length == 0:
        # A zero-width matrix has no cell for a token ID or the end marker;
        # writing to column 0 would raise IndexError.
        return Xs
    for i, doc in enumerate(docs):
        if tree_truncate:
            if isinstance(doc, Span):
                roots = [doc.root]
            else:
                roots = [sent.root for sent in doc.sents]
            queue = deque(roots)
        else:
            queue = deque(doc)
        words = []
        # NOTE(review): `<=` lets up to max_length + 1 tokens be collected; the
        # surplus one is dropped after sorting. Kept unchanged so output stays
        # identical for existing models -- confirm whether `<` was intended.
        while len(words) <= max_length and queue:
            word = queue.popleft()  # O(1), unlike list.pop(0)
            if rnn_encode or (not word.is_punct and not word.is_space):
                words.append(word)
            if tree_truncate:
                queue.extend(word.lefts)
                queue.extend(word.rights)
        # Traversal order is discarded here; tokens are put in sorted order.
        words.sort()
        for j, token in enumerate(words[:max_length]):
            if token.has_vector:
                Xs[i, j] = token.rank + 1
            else:
                Xs[i, j] = (token.shape % (nr_unk - 1)) + 2
        if len(words) < max_length:
            # Room left in the row: mark the position after the last token.
            Xs[i, len(words)] = 1
    return Xs


def create_similarity_pipeline(nlp, max_length=100):
    """Return the pipeline components used for Keras-based similarity.

    The list holds ``nlp.tagger``, ``nlp.entity`` and ``nlp.parser``, followed
    by a KerasSimilarityShim loaded from ``nlp.path / 'similarity'``.

    nlp: pipeline object providing ``tagger``, ``entity``, ``parser``,
        ``path`` and ``vocab``.
    max_length: maximum number of word IDs per doc given to the shim.
    """
    return [
        nlp.tagger,
        nlp.entity,
        nlp.parser,
        # max_length must go by keyword: the third positional parameter of
        # load() is get_features, so a positional int would be stored as the
        # feature extractor and fail when predict() calls it.
        KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length=max_length)
    ]
Rename entailment example 2016-11-01 00:51:54 +00:00			`from keras.models import model_from_json`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 17:43:37 +00:00			`import numpy`
Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`import numpy.random`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`import json`
			`from spacy.tokens.span import Span`

			`try:`
			`import cPickle as pickle`
			`except ImportError:`
			`import pickle`
Rename entailment example 2016-11-01 00:51:54 +00:00

			`class KerasSimilarityShim(object):`
			`@classmethod`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`def load(cls, path, nlp, get_features=None, max_length=100):`
Rename entailment example 2016-11-01 00:51:54 +00:00			`if get_features is None:`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`get_features = get_word_ids`
Rename entailment example 2016-11-01 00:51:54 +00:00			`with (path / 'config.json').open() as file_:`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`model = model_from_json(file_.read())`
Rename entailment example 2016-11-01 00:51:54 +00:00			`with (path / 'model').open('rb') as file_:`
			`weights = pickle.load(file_)`
			`embeddings = get_embeddings(nlp.vocab)`
			`model.set_weights([embeddings] + weights)`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`return cls(model, get_features=get_features, max_length=max_length)`
Rename entailment example 2016-11-01 00:51:54 +00:00
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`def __init__(self, model, get_features=None, max_length=100):`
Rename entailment example 2016-11-01 00:51:54 +00:00			`self.model = model`
			`self.get_features = get_features`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`self.max_length = max_length`
Rename entailment example 2016-11-01 00:51:54 +00:00
			`def __call__(self, doc):`
			`doc.user_hooks['similarity'] = self.predict`
			`doc.user_span_hooks['similarity'] = self.predict`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00
Rename entailment example 2016-11-01 00:51:54 +00:00			`def predict(self, doc1, doc2):`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`x1 = self.get_features([doc1], max_length=self.max_length, tree_truncate=True)`
			`x2 = self.get_features([doc2], max_length=self.max_length, tree_truncate=True)`
Rename entailment example 2016-11-01 00:51:54 +00:00			`scores = self.model.predict([x1, x2])`
			`return scores[0]`


Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`def get_embeddings(vocab, nr_unk=100):`
			`nr_vector = max(lex.rank for lex in vocab) + 1`
			`vectors = numpy.zeros((nr_vector+nr_unk+2, vocab.vectors_length), dtype='float32')`
Rename entailment example 2016-11-01 00:51:54 +00:00			`for lex in vocab:`
			`if lex.has_vector:`
Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`vectors[lex.rank+1] = lex.vector / lex.vector_norm`
Rename entailment example 2016-11-01 00:51:54 +00:00			`return vectors`


Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr_unk=100):`
Rename entailment example 2016-11-01 00:51:54 +00:00			`Xs = numpy.zeros((len(docs), max_length), dtype='int32')`
			`for i, doc in enumerate(docs):`
Fix conflict 2016-11-13 14:52:20 +00:00			`if tree_truncate:`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`if isinstance(doc, Span):`
			`queue = [doc.root]`
			`else:`
			`queue = [sent.root for sent in doc.sents]`
Fix conflict 2016-11-13 14:52:20 +00:00			`else:`
			`queue = list(doc)`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 17:43:37 +00:00			`words = []`
			`while len(words) <= max_length and queue:`
			`word = queue.pop(0)`
Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`if rnn_encode or (not word.is_punct and not word.is_space):`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 17:43:37 +00:00			`words.append(word)`
Fix conflict 2016-11-13 14:52:20 +00:00			`if tree_truncate:`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 17:43:37 +00:00			`queue.extend(list(word.lefts))`
			`queue.extend(list(word.rights))`
			`words.sort()`
			`for j, token in enumerate(words):`
Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`if token.has_vector:`
			`Xs[i, j] = token.rank+1`
			`else:`
			`Xs[i, j] = (token.shape % (nr_unk-1))+2`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 17:43:37 +00:00			`j += 1`
			`if j >= max_length:`
			`break`
Add partial embedding updates to Parikh model, fix dropout, other corrections. 2016-11-18 12:32:12 +00:00			`else:`
			`Xs[i, len(words)] = 1`
Rename entailment example 2016-11-01 00:51:54 +00:00			`return Xs`


Set max_length to 100 for demo and evaluate 2017-04-05 11:18:35 +00:00			`def create_similarity_pipeline(nlp, max_length=100):`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`return [`
			`nlp.tagger,`
			`nlp.entity,`
			`nlp.parser,`
Set max_length to 100 for demo and evaluate 2017-04-05 11:18:35 +00:00			`KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length)`
Fix x keras deep learning example 2017-01-31 19:27:13 +00:00			`]`