mirror of https://github.com/explosion/spaCy.git
78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
import numpy as np
|
|
from keras.models import model_from_json
|
|
|
|
try:
|
|
import cPickle as pickle
|
|
except ImportError:
|
|
import pickle
|
|
|
|
|
|
class KerasSimilarityShim(object):
|
|
entailment_types = ["entailment", "contradiction", "neutral"]
|
|
|
|
@classmethod
|
|
def load(cls, path, nlp, max_length=100, get_features=None):
|
|
|
|
if get_features is None:
|
|
get_features = get_word_ids
|
|
|
|
with (path / 'config.json').open() as file_:
|
|
model = model_from_json(file_.read())
|
|
with (path / 'model').open('rb') as file_:
|
|
weights = pickle.load(file_)
|
|
|
|
embeddings = get_embeddings(nlp.vocab)
|
|
weights.insert(1, embeddings)
|
|
model.set_weights(weights)
|
|
|
|
return cls(model, get_features=get_features, max_length=max_length)
|
|
|
|
def __init__(self, model, get_features=None, max_length=100):
|
|
self.model = model
|
|
self.get_features = get_features
|
|
self.max_length = max_length
|
|
|
|
def __call__(self, doc):
|
|
doc.user_hooks['similarity'] = self.predict
|
|
doc.user_span_hooks['similarity'] = self.predict
|
|
|
|
return doc
|
|
|
|
def predict(self, doc1, doc2):
|
|
x1 = self.get_features([doc1], max_length=self.max_length)
|
|
x2 = self.get_features([doc2], max_length=self.max_length)
|
|
scores = self.model.predict([x1, x2])
|
|
|
|
return self.entailment_types[scores.argmax()], scores.max()
|
|
|
|
|
|
def get_embeddings(vocab, nr_unk=100):
|
|
# the extra +1 is for a zero vector representing sentence-final padding
|
|
num_vectors = max(lex.rank for lex in vocab) + 2
|
|
|
|
# create random vectors for OOV tokens
|
|
oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
|
|
oov = oov / oov.sum(axis=1, keepdims=True)
|
|
|
|
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32')
|
|
vectors[1:(nr_unk + 1), ] = oov
|
|
for lex in vocab:
|
|
if lex.has_vector and lex.vector_norm > 0:
|
|
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
|
|
|
|
return vectors
|
|
|
|
|
|
def get_word_ids(docs, max_length=100, nr_unk=100):
|
|
Xs = np.zeros((len(docs), max_length), dtype='int32')
|
|
|
|
for i, doc in enumerate(docs):
|
|
for j, token in enumerate(doc):
|
|
if j == max_length:
|
|
break
|
|
if token.has_vector:
|
|
Xs[i, j] = token.rank + nr_unk + 1
|
|
else:
|
|
Xs[i, j] = token.rank % nr_unk + 1
|
|
return Xs
|