diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 684b27ad0..a2ba1d243 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -100,11 +100,11 @@ def pretrain(
             pretrained_vectors=pretrained_vectors,
             bilstm_depth=0,  # Requires PyTorch. Experimental.
             cnn_maxout_pieces=3,  # You can try setting this higher
-            subword_features=True,
+            subword_features=True,  # Set to False for Chinese etc
         ),
-    )  # Set to False for character models, e.g. Chinese
+    )
     optimizer = create_default_optimizer(model.ops)
-    tracker = ProgressTracker()
+    tracker = ProgressTracker(frequency=10000)
     msg.divider("Pre-training tok2vec layer")
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
@@ -136,7 +136,7 @@ def pretrain(
         random.shuffle(texts)
 
 
-def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
+def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
     """Perform an update over a single batch of documents.
 
     docs (iterable): A batch of `Doc` objects.
@@ -145,13 +145,12 @@ def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    gradients = get_vectors_loss(model.ops, docs, predictions, objective)
+    loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
     backprop(gradients, sgd=optimizer)
     # Don't want to return a cupy object here
     # The gradients are modified in-place by the BERT MLM,
     # so we get an accurate loss
-    loss = float((gradients ** 2).sum())
-    return loss
+    return float(loss)
 
 
 def make_docs(nlp, batch, min_length=1, max_length=500):
@@ -172,7 +171,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
     return docs
 
 
-def get_vectors_loss(ops, docs, prediction, objective):
+def get_vectors_loss(ops, docs, prediction, objective='L2'):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.
 
@@ -188,80 +187,23 @@ def get_vectors_loss(ops, docs, prediction, objective):
     target = docs[0].vocab.vectors.data[ids]
     if objective == 'L2':
         d_scores = prediction - target
-    elif objective == 'nllvmf':
-        d_scores = get_nllvmf_loss(prediction, target)
+        loss = (d_scores**2).sum()
     else:
-        d_scores = get_cossim_loss(prediction, target)
-    return d_scores
+        raise NotImplementedError(objective)
+    return loss, d_scores
 
 
 
-def get_cossim_loss(yh, y):
-    # Add a small constant to avoid 0 vectors
-    yh = yh + 1e-8
-    y = y + 1e-8
-    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
-    xp = get_array_module(yh)
-    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
-    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
-    mul_norms = norm_yh * norm_y
-    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
-    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2))
-    return d_yh
-
-
-def get_nllvmf_loss(Yh, Y):
-    """Compute the gradient of the negative log likelihood von Mises-Fisher loss,
-    from Kumar and Tsetskov.
-    Yh: Predicted vectors.
-    Y: True vectors
-    Returns dYh: Gradient of loss with respect to prediction.
-    """
-    # Warning: Probably wrong? Also needs normalization
-    xp = get_array_module(Yh)
-    assert not xp.isnan(Yh).any()
-    assert not xp.isnan(Y).any()
-    return _backprop_bessel(Yh) * Y
-
-
-def _backprop_bessel(k, approximate=True):
-    if approximate:
-        return -_ratio(k.shape[1]/2, k)
-    from scipy.special import ive
-    xp = get_array_module(k)
-    if not isinstance(k, numpy.ndarray):
-        k = k.get()
-    k = numpy.asarray(k, dtype='float64')
-    assert not numpy.isnan(k).any()
-    m = k.shape[1]
-    numerator = ive(m/2, k)
-    assert not numpy.isnan(numerator).any()
-    denom = ive(m/2-1, k)
-    assert not numpy.isnan(denom).any()
-    x = -(numerator / (denom+1e-8))
-    assert not numpy.isnan(x).any()
-    return xp.array(x, dtype='f')
-
-
-def _ratio(v, z):
-    return z/(v-1+numpy.sqrt((v+1)**2 + z**2, dtype='f'))
-
-
-
-def create_pretraining_model(nlp, tok2vec, normalized=False):
+def create_pretraining_model(nlp, tok2vec):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
     """
-    if normalized:
-        normalize_vectors(nlp.vocab.vectors.data)
     output_size = nlp.vocab.vectors.data.shape[1]
     output_layer = chain(
         LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0),
     )
-    if normalized:
-        output_layer = chain(output_layer, normalize)
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
@@ -275,28 +217,6 @@ def create_pretraining_model(nlp, tok2vec, normalized=False):
     return model
 
 
-@layerize
-def normalize(X, drop=0.):
-    xp = get_array_module(X)
-    norms = xp.sqrt((X**2).sum(axis=1, keepdims=True)+1e-8)
-    Y = X / norms
-    def backprop_normalize(dY, sgd=None):
-        d_norms = 2 * norms
-        #dY = (dX * norms - X * d_norms) / norms**2
-        #dY * norms**2 = dX * norms - X * d_norms
-        #dY * norms**2 + X * d_norms = dX * norms
-        #(dY * norms**2 + X * d_norms) / norms = dX
-        dX = (dY * norms**2 + X * d_norms) / norms
-        return dX
-    return Y, backprop_normalize
-
-
-def normalize_vectors(vectors_data):
-    xp = get_array_module(vectors_data)
-    norms = xp.sqrt((vectors_data**2).sum(axis=1, keepdims=True)+1e-8)
-    vectors_data /= norms
-
-
 class ProgressTracker(object):
     def __init__(self, frequency=1000000):
         self.loss = 0.0
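
For reference, a minimal plain-numpy sketch of what the reworked 'L2' branch of get_vectors_loss computes. The function name l2_vectors_loss, the toy shapes, and the random arrays below are illustrative only and not part of spaCy's API: the gradient handed to backprop() is the raw difference between prediction and target (the derivative of 0.5 * ||prediction - target||^2), while the scalar returned for logging is the full summed squared error.

import numpy

def l2_vectors_loss(prediction, target):
    # Gradient passed to the backprop callback.
    d_scores = prediction - target
    # Scalar loss reported via ProgressTracker.
    loss = (d_scores ** 2).sum()
    return loss, d_scores

# Toy usage: random arrays standing in for the tok2vec predictions and the
# rows of nlp.vocab.vectors.data selected by token ID.
prediction = numpy.random.uniform(-1, 1, (8, 300)).astype("f")
target = numpy.random.uniform(-1, 1, (8, 300)).astype("f")
loss, d_scores = l2_vectors_loss(prediction, target)
assert d_scores.shape == prediction.shape
print(float(loss))

With this split, make_update simply calls backprop(gradients, sgd=optimizer) and returns float(loss), so the logged total is the value computed inside get_vectors_loss rather than being re-derived from gradients that may have been modified in place.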