diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 9a9d9e213..684b27ad0 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -8,9 +8,9 @@ import time
 from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
-from thinc.api import wrap
+from thinc.api import wrap, layerize
 from thinc.misc import LayerNorm as LN
-from thinc.neural.util import prefer_gpu
+from thinc.neural.util import prefer_gpu, get_array_module
 from wasabi import Printer
 import srsly
 
@@ -99,7 +99,7 @@ def pretrain(
             conv_depth=depth,
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=0,  # Requires PyTorch. Experimental.
-            cnn_maxout_pieces=2,  # You can try setting this higher
+            cnn_maxout_pieces=3,  # You can try setting this higher
             subword_features=True,
         ),
     )  # Set to False for character models, e.g. Chinese
@@ -136,7 +136,7 @@ def pretrain(
             random.shuffle(texts)
 
 
-def make_update(model, docs, optimizer, drop=0.0):
+def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
     """Perform an update over a single batch of documents.
 
     docs (iterable): A batch of `Doc` objects.
@@ -145,12 +145,12 @@ def make_update(model, docs, optimizer, drop=0.0):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    gradients = get_vectors_loss(model.ops, docs, predictions)
+    gradients = get_vectors_loss(model.ops, docs, predictions, objective)
     backprop(gradients, sgd=optimizer)
     # Don't want to return a cupy object here
     # The gradients are modified in-place by the BERT MLM,
     # so we get an accurate loss
-    loss = float((gradients ** 2).mean())
+    loss = float((gradients ** 2).sum())
     return loss
 
 
@@ -172,7 +172,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
     return docs
 
 
-def get_vectors_loss(ops, docs, prediction):
+def get_vectors_loss(ops, docs, prediction, objective):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.
 
@@ -186,20 +186,82 @@ def get_vectors_loss(ops, docs, prediction):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
-    d_scores = prediction - target
+    if objective == 'L2':
+        d_scores = prediction - target
+    elif objective == 'nllvmf':
+        d_scores = get_nllvmf_loss(prediction, target)
+    else:
+        d_scores = get_cossim_loss(prediction, target)
     return d_scores
 
 
-def create_pretraining_model(nlp, tok2vec):
+def get_cossim_loss(yh, y):
+    # Add a small constant to avoid 0 vectors
+    yh = yh + 1e-8
+    y = y + 1e-8
+    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+    xp = get_array_module(yh)
+    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+    mul_norms = norm_yh * norm_y
+    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
+    # The loss is -cosine, so flip the sign of d(cosine)/d(yh)
+    return -d_yh
+
+
+def get_nllvmf_loss(Yh, Y):
+    """Compute the gradient of the negative log-likelihood von Mises-Fisher
+    loss, from Kumar and Tsvetkov (2019).
+    Yh: Predicted vectors.
+    Y: True vectors.
+    RETURNS dYh: The gradient of the loss with respect to the prediction.
+    """
+    # Warning: Probably wrong? Also needs normalization
+    xp = get_array_module(Yh)
+    assert not xp.isnan(Yh).any()
+    assert not xp.isnan(Y).any()
+    return _backprop_bessel(Yh) * Y
+
+
+def _backprop_bessel(k, approximate=True):
+    if approximate:
+        return -_ratio(k.shape[1] / 2, k)
+    # Exact ratio: lazy import, so scipy is only required for this branch.
+    from scipy.special import ive
+    xp = get_array_module(k)
+    if not isinstance(k, numpy.ndarray):
+        k = k.get()
+    k = numpy.asarray(k, dtype='float64')
+    assert not numpy.isnan(k).any()
+    m = k.shape[1]
+    numerator = ive(m / 2, k)
+    assert not numpy.isnan(numerator).any()
+    denom = ive(m / 2 - 1, k)
+    assert not numpy.isnan(denom).any()
+    x = -(numerator / (denom + 1e-8))
+    assert not numpy.isnan(x).any()
+    return xp.array(x, dtype='f')
+
+
+def _ratio(v, z):
+    # Closed-form approximation of the Bessel ratio I_v(z) / I_(v-1)(z)
+    return z / (v - 1 + numpy.sqrt((v + 1) ** 2 + z ** 2, dtype='f'))
+
+
+def create_pretraining_model(nlp, tok2vec, normalized=False):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
     """
+    if normalized:
+        normalize_vectors(nlp.vocab.vectors.data)
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = chain(
-        LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
+        LN(Maxout(300, pieces=3)),
+        Affine(output_size, drop_factor=0.0),
     )
+    if normalized:
+        output_layer = chain(output_layer, normalize)
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
@@ -213,6 +275,28 @@ def create_pretraining_model(nlp, tok2vec):
     return model
 
 
+@layerize
+def normalize(X, drop=0.0):
+    xp = get_array_module(X)
+    norms = xp.sqrt((X ** 2).sum(axis=1, keepdims=True) + 1e-8)
+    Y = X / norms
+
+    def backprop_normalize(dY, sgd=None):
+        # Y = X / norms and d(norms)/dX = X / norms, so:
+        # dX = dY / norms - X * (dY * X).sum(axis=1) / norms ** 3
+        #    = (dY - Y * (dY * Y).sum(axis=1)) / norms
+        dX = (dY - Y * (dY * Y).sum(axis=1, keepdims=True)) / norms
+        return dX
+
+    return Y, backprop_normalize
+
+
+def normalize_vectors(vectors_data):
+    # Normalise the vectors table in place, so the targets are unit-length
+    xp = get_array_module(vectors_data)
+    norms = xp.sqrt((vectors_data ** 2).sum(axis=1, keepdims=True) + 1e-8)
+    vectors_data /= norms
+
+
 class ProgressTracker(object):
     def __init__(self, frequency=1000000):
         self.loss = 0.0
@@ -239,8 +323,8 @@ class ProgressTracker(object):
             status = (
                 epoch,
                 self.nr_word,
-                "%.5f" % self.loss,
-                "%.4f" % loss_per_word,
+                "%.8f" % self.loss,
+                "%.8f" % loss_per_word,
                 int(wps),
             )
            self.prev_loss = float(self.loss)
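
The cosine objective in the patch relies on the row-wise gradient d(cosine)/d(yh) = y / (|yh| |y|) - cosine * yh / |yh|^2, and the negated value is what gets passed to backprop so that gradient descent increases the similarity. The following is a standalone sketch, not part of the patch or of spaCy's API: it uses plain numpy and illustrative names (cosine_rows, d_cosine_d_yh) to check that formula against central finite differences.

import numpy


def cosine_rows(yh, y):
    # Row-wise cosine similarity between two (n, d) arrays, with the same
    # small constant the patch adds to avoid zero vectors.
    yh = yh + 1e-8
    y = y + 1e-8
    norm_yh = numpy.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = numpy.linalg.norm(y, axis=1, keepdims=True)
    return (yh * y).sum(axis=1, keepdims=True) / (norm_yh * norm_y)


def d_cosine_d_yh(yh, y):
    # The analytic gradient used in get_cossim_loss (before negation).
    yh = yh + 1e-8
    y = y + 1e-8
    norm_yh = numpy.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = numpy.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    return (y / mul_norms) - cosine * (yh / norm_yh ** 2)


def check_cosine_gradient():
    rng = numpy.random.RandomState(0)
    yh = rng.normal(size=(4, 10))
    y = rng.normal(size=(4, 10))
    analytic = d_cosine_d_yh(yh, y)
    numeric = numpy.zeros_like(yh)
    eps = 1e-6
    for i in range(yh.shape[0]):
        for j in range(yh.shape[1]):
            plus, minus = yh.copy(), yh.copy()
            plus[i, j] += eps
            minus[i, j] -= eps
            # Central difference on the cosine of row i only.
            numeric[i, j] = (
                cosine_rows(plus, y)[i, 0] - cosine_rows(minus, y)[i, 0]
            ) / (2 * eps)
    assert numpy.allclose(analytic, numeric, atol=1e-5)


if __name__ == "__main__":
    check_cosine_gradient()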
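
The backward pass of the normalize layer follows from Y = X / |X| row-wise: dX = (dY - Y * (dY . Y)) / |X|. The sketch below verifies that expression with finite differences on an arbitrary scalar objective; it again uses plain numpy and hypothetical names (normalize_rows, backprop_normalize_rows), and keeps the 1e-8 constant inside the square root exactly as the patch does.

import numpy


def normalize_rows(X):
    # Forward pass: scale each row to (approximately) unit length.
    norms = numpy.sqrt((X ** 2).sum(axis=1, keepdims=True) + 1e-8)
    return X / norms


def backprop_normalize_rows(dY, X):
    # Backward pass: dX = (dY - Y * (dY * Y).sum(axis=1)) / norms.
    norms = numpy.sqrt((X ** 2).sum(axis=1, keepdims=True) + 1e-8)
    Y = X / norms
    return (dY - Y * (dY * Y).sum(axis=1, keepdims=True)) / norms


def check_normalize_gradient():
    rng = numpy.random.RandomState(0)
    X = rng.normal(size=(3, 7))
    W = rng.normal(size=(3, 7))  # scalar objective: (W * normalize_rows(X)).sum()
    analytic = backprop_normalize_rows(W, X)
    numeric = numpy.zeros_like(X)
    eps = 1e-6
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            plus, minus = X.copy(), X.copy()
            plus[i, j] += eps
            minus[i, j] -= eps
            numeric[i, j] = (
                (W * normalize_rows(plus)).sum() - (W * normalize_rows(minus)).sum()
            ) / (2 * eps)
    assert numpy.allclose(analytic, numeric, atol=1e-5)


if __name__ == "__main__":
    check_normalize_gradient()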
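
_backprop_bessel has two branches: an exact Bessel ratio via scipy.special.ive (the exponential scaling cancels in the ratio, so large arguments stay finite) and the closed-form approximation _ratio(v, z) = z / (v - 1 + sqrt((v + 1)^2 + z^2)). The short comparison below is a sketch of how the two line up for a 300-dimensional case (v = 150); it assumes scipy is installed and sticks to moderate-to-large z so the scaled Bessel values stay well above underflow.

import numpy
from scipy.special import ive


def ratio_approx(v, z):
    # The closed-form bound used by _ratio in the patch.
    return z / (v - 1 + numpy.sqrt((v + 1) ** 2 + z ** 2))


def ratio_exact(v, z):
    # I_v(z) / I_(v-1)(z), computed with the exponentially scaled ive.
    return ive(v, z) / ive(v - 1, z)


if __name__ == "__main__":
    v = 150.0  # m / 2 for 300-dimensional vectors
    for z in (50.0, 150.0, 300.0, 1000.0):
        print("z=%g exact=%.6f approx=%.6f" % (z, ratio_exact(v, z), ratio_approx(v, z)))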