Add experimental SimilarityHook component

Matthew Honnibal 2017-06-05 15:40:03 +02:00
parent f67efc4720
commit d59fa32df1
1 changed file with 101 additions and 1 deletion

@@ -18,6 +18,9 @@ from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.util import to_categorical
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN
@@ -362,7 +365,6 @@ class NeuralTagger(object):
                self.vocab.strings, tag_map=tag_map,
                lemmatizer=self.vocab.morphology.lemmatizer,
                exc=self.vocab.morphology.exc)

        deserialize = OrderedDict((
            ('vocab', lambda p: self.vocab.from_disk(p)),
@@ -421,6 +423,104 @@ class NeuralLabeller(NeuralTagger):
        return float(loss), d_scores


class SimilarityHook(object):
    """
    Experimental

    A pipeline component to install a hook for supervised similarity into
    Doc objects. Requires a Tensorizer to pre-process documents. The similarity
    model can be any object obeying the Thinc Model interface. By default,
    the model concatenates the elementwise mean and elementwise max of the two
    tensors, and compares them using the Cauchy-like similarity function
    from Chen (2013):

        similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

    Where W is a vector of dimension weights, initialized to 1.
    """
    name = 'similarity'

    def __init__(self, vocab, model=True):
        self.vocab = vocab
        self.model = model

    @classmethod
    def Model(cls, length):
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

    def __call__(self, doc):
        '''Install similarity hook'''
        doc.user_hooks['similarity'] = self.predict
        return doc

    def pipe(self, docs, **kwargs):
        for doc in docs:
            yield self(doc)

    def predict(self, doc1, doc2):
        return self.model.predict([(doc1.tensor, doc2.tensor)])

    def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
        doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
        sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
                                                drop=drop)
        d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
        return d_tensor1s, d_tensor2s

    def begin_training(self, _, pipeline=None):
        """
        Allocate model, using width from tensorizer in pipeline.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)

    def use_params(self, params):
        """Replace weights of models in the pipeline with those provided in the
        params dictionary.

        params (dict): A dictionary of parameters keyed by model ID.
        """
        with self.model.use_params(params):
            yield

    def to_bytes(self, **exclude):
        serialize = OrderedDict((
            ('model', lambda: self.model.to_bytes()),
            ('vocab', lambda: self.vocab.to_bytes())
        ))
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        if self.model is True:
            self.model = self.Model()
        deserialize = OrderedDict((
            ('model', lambda b: self.model.from_bytes(b)),
            ('vocab', lambda b: self.vocab.from_bytes(b))
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        serialize = OrderedDict((
            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
            ('vocab', lambda p: self.vocab.to_disk(p))
        ))
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        if self.model is True:
            self.model = self.Model()
        deserialize = OrderedDict((
            ('model', lambda p: self.model.from_bytes(p.open('rb').read())),
            ('vocab', lambda p: self.vocab.from_disk(p))
        ))
        util.from_disk(path, deserialize, exclude)
        return self
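
For illustration only (not part of the commit): the default Model, Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)), can be approximated in plain numpy as below. The pooling order and the weight-vector shape here are assumptions about the Thinc internals, not taken from the diff.

    import numpy as np

    def pool(tensor):
        # Hypothetical stand-in for Pooling(max_pool, mean_pool): concatenate
        # the elementwise max and mean over the token axis.
        return np.concatenate([tensor.max(axis=0), tensor.mean(axis=0)])

    def cauchy_similarity(W, vec1, vec2):
        # The docstring's similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
        return 1. / (1. + (W * (vec1 - vec2) ** 2).sum())

    doc1_tensor = np.random.rand(5, 64)   # 5 tokens, tensor width 64
    doc2_tensor = np.random.rand(8, 64)
    v1, v2 = pool(doc1_tensor), pool(doc2_tensor)
    W = np.ones_like(v1)                  # dimension weights, initialized to 1
    print(cauchy_similarity(W, v1, v2))   # value in (0, 1]
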

cdef class EntityRecognizer(LinearParser):
    """Annotate named entities on Doc objects."""
    TransitionSystem = BiluoPushDown
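
A minimal usage sketch, assuming an nlp object whose first pipeline component is a tensorizer that populates doc.tensor; the setup lines are illustrative, not part of this commit, and an untrained model will not give meaningful scores until it has been updated.

    hook = SimilarityHook(nlp.vocab)
    hook.begin_training(None, pipeline=nlp.pipeline)  # width from pipeline[0].model.nO
    nlp.pipeline.append(hook)

    doc1 = nlp(u'The food was delicious.')
    doc2 = nlp(u'The meal tasted great.')
    # Doc.similarity dispatches through doc.user_hooks['similarity'],
    # which __call__ pointed at SimilarityHook.predict.
    print(doc1.similarity(doc2))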