Experimental character-based pretraining (#5700)

* Use cosine loss in Cloze multitask

* Fix char_embed for gpu

* Call resume_training for base model in train CLI

* Fix bilstm_depth default in pretrain command

* Implement character-based pretraining objective

* Use chars loss in ClozeMultitask

* Add method to decode predicted characters

* Fix number characters

* Rescale gradients for mlm

* Fix char embed+vectors in ml

* Fix pipes

* Fix pretrain args

* Move get_characters_loss

* Fix import

* Fix import

* Mention characters loss option in pretrain

* Remove broken 'self attention' option in pretrain

* Revert "Remove broken 'self attention' option in pretrain"

This reverts commit 56b820f6af.

* Document 'characters' objective of pretrain
Matthew Honnibal, 2020-07-05 15:48:39 +02:00 (committed by GitHub)
parent 86d13a9fb8
commit 3e78e82a83
6 changed files with 92 additions and 33 deletions


@@ -14,7 +14,7 @@ from thinc.api import with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module, copy_array
+from thinc.neural.util import get_array_module, copy_array, to_categorical
 from thinc.neural.optimizers import Adam
 from thinc import describe
@@ -840,6 +840,8 @@ def masked_language_model(vocab, model, mask_prob=0.15):
         def mlm_backward(d_output, sgd=None):
             d_output *= 1 - mask
+            # Rescale gradient for number of instances.
+            d_output *= mask.size - mask.sum()
             return backprop(d_output, sgd=sgd)

         return output, mlm_backward
@@ -944,7 +946,7 @@ class CharacterEmbed(Model):
         # for the tip.
         nCv = self.ops.xp.arange(self.nC)
         for doc in docs:
-            doc_ids = doc.to_utf8_array(nr_char=self.nC)
+            doc_ids = self.ops.asarray(doc.to_utf8_array(nr_char=self.nC))
             doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
             # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
             # incantation do I chant to get
@@ -986,3 +988,17 @@ def get_cossim_loss(yh, y, ignore_zeros=False):
         losses[zero_indices] = 0
     loss = losses.sum()
     return loss, -d_yh
+
+
+def get_characters_loss(ops, docs, prediction, nr_char=10):
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f")
+    target = target.reshape((-1, 256*nr_char))
+    diff = prediction - target
+    loss = (diff**2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
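For orientation, here is a minimal numpy sketch of what the new `get_characters_loss` objective computes: the target is a one-hot encoding of the UTF-8 bytes that `Doc.to_utf8_array` extracts for each token, and the loss is the summed squared error against the model's prediction. The shapes and random inputs below are assumptions for illustration only; the real code uses Thinc's `to_categorical` and the model's `ops.asarray`.

```python
import numpy as np

# Assumed shapes for illustration: `prediction` would come from the
# character output layer added in the pretrain command below, and
# `target_ids` from Doc.to_utf8_array(nr_char=nr_char).
nr_char = 10
n_tokens = 3
target_ids = np.random.randint(0, 256, size=(n_tokens, nr_char))
prediction = np.random.rand(n_tokens, 256 * nr_char).astype("f")

# One-hot byte targets, equivalent to to_categorical(..., nb_classes=256)
target = np.eye(256, dtype="f")[target_ids.reshape(-1)]
target = target.reshape((n_tokens, 256 * nr_char))

diff = prediction - target
loss = (diff ** 2).sum()                      # squared error over all byte slots
d_target = diff / float(prediction.shape[0])  # gradient, scaled by the number of rows
print(loss, d_target.shape)
```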


@@ -18,7 +18,8 @@ from ..errors import Errors
 from ..tokens import Doc
 from ..attrs import ID, HEAD
 from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
-from .._ml import masked_language_model, get_cossim_loss
+from .._ml import masked_language_model, get_cossim_loss, get_characters_loss
+from .._ml import MultiSoftmax
 from .. import util
 from .train import _load_pretrained_tok2vec
@@ -42,7 +43,7 @@ from .train import _load_pretrained_tok2vec
     bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
     embed_rows=("Number of embedding rows", "option", "er", int),
     loss_func=(
-        "Loss function to use for the objective. Either 'L2' or 'cosine'",
+        "Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'",
         "option",
        "L",
        str,
@@ -85,11 +86,11 @@ def pretrain(
     output_dir,
     width=96,
     conv_depth=4,
-    bilstm_depth=0,
     cnn_pieces=3,
     sa_depth=0,
-    use_chars=False,
     cnn_window=1,
+    bilstm_depth=0,
+    use_chars=False,
     embed_rows=2000,
     loss_func="cosine",
     use_vectors=False,
@@ -124,11 +125,7 @@ def pretrain(
             config[key] = str(config[key])
     util.fix_random_seed(seed)

-    has_gpu = prefer_gpu()
-    if has_gpu:
-        import torch
-
-        torch.set_default_tensor_type("torch.cuda.FloatTensor")
+    has_gpu = prefer_gpu(gpu_id=1)
     msg.info("Using GPU" if has_gpu else "Not using GPU")

     output_dir = Path(output_dir)
@@ -174,6 +171,7 @@ def pretrain(
             subword_features=not use_chars,  # Set to False for Chinese etc
             cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
         ),
+        objective=loss_func
     )
     # Load in pretrained weights
     if init_tok2vec is not None:
@@ -264,7 +262,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
+    if objective == "characters":
+        loss, gradients = get_characters_loss(model.ops, docs, predictions)
+    else:
+        loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
     backprop(gradients, sgd=optimizer)
     # Don't want to return a cupy object here
     # The gradients are modified in-place by the BERT MLM,
@@ -326,16 +327,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
     return loss, d_target


-def create_pretraining_model(nlp, tok2vec):
+def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
     """Define a network for the pretraining. We simply add an output layer onto
     the tok2vec input model. The tok2vec input model needs to be a model that
     takes a batch of Doc objects (as a list), and returns a list of arrays.
     Each array in the output needs to have one row per token in the doc.
     """
-    output_size = nlp.vocab.vectors.data.shape[1]
-    output_layer = chain(
-        LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
-    )
+    if objective == "characters":
+        out_sizes = [256] * nr_char
+        output_layer = chain(
+            LN(Maxout(300, pieces=3)),
+            MultiSoftmax(out_sizes, 300)
+        )
+    else:
+        output_size = nlp.vocab.vectors.data.shape[1]
+        output_layer = chain(
+            LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
+        )
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
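`MultiSoftmax` (imported from `_ml` in the hunk above) builds the output layer for the `'characters'` objective. As a hypothetical numpy illustration of the general idea, not the actual Thinc implementation: the output is split into `nr_char` blocks of 256 logits, and a softmax is taken over each block independently, so every block predicts one UTF-8 byte of the token.

```python
import numpy as np

def blockwise_softmax(logits, out_sizes):
    # Hypothetical sketch: independent softmax over each block of the output,
    # here one 256-way distribution per predicted character position.
    blocks = []
    start = 0
    for size in out_sizes:
        block = logits[:, start:start + size]
        block = block - block.max(axis=1, keepdims=True)  # numerical stability
        exp = np.exp(block)
        blocks.append(exp / exp.sum(axis=1, keepdims=True))
        start += size
    return np.concatenate(blocks, axis=1)

nr_char, n_tokens = 10, 5
logits = np.random.randn(n_tokens, nr_char * 256)
probs = blockwise_softmax(logits, [256] * nr_char)
assert probs.shape == (n_tokens, nr_char * 256)
assert np.allclose(probs[:, :256].sum(axis=1), 1.0)  # each block sums to 1
```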


@@ -285,7 +285,7 @@ def train(
         if base_model and not pipes_added:
             # Start with an existing model, use default optimizer
-            optimizer = create_default_optimizer(Model.ops)
+            optimizer = nlp.resume_training(device=use_gpu)
         else:
             # Start with a blank model, call begin_training
             cfg = {"device": use_gpu}


@@ -49,6 +49,14 @@ def Tok2Vec(width, embed_size, **kwargs):
                 >> LN(Maxout(width, width * 5, pieces=3)),
                 column=cols.index(ORTH),
             )
+        elif char_embed:
+            embed = concatenate_lists(
+                CharacterEmbed(nM=64, nC=8),
+                FeatureExtracter(cols) >> with_flatten(glove),
+            )
+            reduce_dimensions = LN(
+                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
+            )
         else:
             embed = uniqued(
                 (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
@@ -81,7 +89,8 @@ def Tok2Vec(width, embed_size, **kwargs):
             )
         else:
             tok2vec = FeatureExtracter(cols) >> with_flatten(
-                embed >> convolution ** conv_depth, pad=conv_depth
+                embed
+                >> convolution ** conv_depth, pad=conv_depth
             )

         if bilstm_depth >= 1:


@@ -33,6 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, Warnings
 from .. import util
@@ -846,11 +847,15 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
     @classmethod
     def Model(cls, vocab, tok2vec, **cfg):
-        output_size = vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
-            zero_init(Affine(output_size, output_size, drop_factor=0.0))
-        )
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 4)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
+            output_size = vocab.vectors.data.shape[1]
+            output_layer = chain(
+                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
+                zero_init(Affine(output_size, output_size, drop_factor=0.0))
+            )
         model = chain(tok2vec, output_layer)
         model = masked_language_model(vocab, model)
         model.tok2vec = tok2vec
@@ -861,6 +866,8 @@ class ClozeMultitask(Pipe):
         self.vocab = vocab
         self.model = model
         self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 4)

     def set_annotations(self, docs, dep_ids, tensors=None):
         pass
@@ -869,7 +876,8 @@ class ClozeMultitask(Pipe):
                        tok2vec=None, sgd=None, **kwargs):
         link_vectors_to_models(self.vocab)
         if self.model is True:
-            self.model = self.Model(self.vocab, tok2vec)
+            kwargs.update(self.cfg)
+            self.model = self.Model(self.vocab, tok2vec, **kwargs)
         X = self.model.ops.allocate((5, self.model.tok2vec.nO))
         self.model.output_layer.begin_training(X)
         if sgd is None:
@@ -883,13 +891,16 @@ class ClozeMultitask(Pipe):
         return tokvecs, vectors

     def get_loss(self, docs, vectors, prediction):
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-        target = vectors[ids]
-        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
+        if self.cfg["objective"] == "characters":
+            loss, gradient = get_characters_loss(self.model.ops, docs, prediction)
+        else:
+            # The simplest way to implement this would be to vstack the
+            # token.vector values, but that's a bit inefficient, especially on GPU.
+            # Instead we fetch the index into the vectors table for each of our tokens,
+            # and look them up all at once. This prevents data copying.
+            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+            target = vectors[ids]
+            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
         return float(loss), gradient

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
@@ -906,6 +917,20 @@ class ClozeMultitask(Pipe):
         if losses is not None:
             losses[self.name] += loss

+    @staticmethod
+    def decode_utf8_predictions(char_array):
+        # The format alternates filling from start and end, and 255 is missing
+        words = []
+        char_array = char_array.reshape((char_array.shape[0], -1, 256))
+        nr_char = char_array.shape[1]
+        char_array = char_array.argmax(axis=-1)
+        for row in char_array:
+            starts = [chr(c) for c in row[::2] if c != 255]
+            ends = [chr(c) for c in row[1::2] if c != 255]
+            word = "".join(starts + list(reversed(ends)))
+            words.append(word)
+        return words
+

 @component("textcat", assigns=["doc.cats"])
 class TextCategorizer(Pipe):
@@ -1069,6 +1094,7 @@ cdef class DependencyParser(Parser):
     assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
     requires = []
     TransitionSystem = ArcEager
+    nr_feature = 8

     @property
     def postprocesses(self):
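The `decode_utf8_predictions` helper added above reads back the alternating layout noted in its comment: even slots hold bytes counted from the start of the word, odd slots hold bytes counted from the end, and 255 marks an unused slot. Below is a hypothetical pure-Python sketch of that round trip; the real encoding lives in `Doc.to_utf8_array` and operates on UTF-8 bytes, so `encode_word` here is an ASCII-only stand-in for illustration.

```python
import numpy as np

def encode_word(word, nr_char=4):
    # Hypothetical ASCII-only sketch of the alternating start/end layout;
    # 255 marks an unused slot.
    ids = np.full(nr_char, 255, dtype="i")
    chars = [ord(c) for c in word]
    for i in range(nr_char):
        if not chars:
            break
        ids[i] = chars.pop(0) if i % 2 == 0 else chars.pop()
    return ids

def decode_word(row):
    # Mirrors the per-row logic of ClozeMultitask.decode_utf8_predictions
    # (which first takes an argmax over each 256-wide block of logits).
    starts = [chr(c) for c in row[::2] if c != 255]
    ends = [chr(c) for c in row[1::2] if c != 255]
    return "".join(starts + list(reversed(ends)))

print(encode_word("hi"))                  # [104 105 255 255]
print(decode_word(encode_word("hi")))     # "hi"
print(decode_word(encode_word("hello")))  # "helo" -- middle characters are lost
```

This also shows why a small `nr_char` (4 by default in the multitask above, 10 in the pretraining loss) only captures the beginning and end of longer words.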


@@ -473,7 +473,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
 | `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag> | option | Depth of self-attention layers. |
 | `--embed-rows`, `-er` | option | Number of embedding rows. |
-| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. |
+| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"cosine"`, `"L2"` or `"characters"`. |
 | `--dropout`, `-d` | option | Dropout rate. |
 | `--batch-size`, `-bs` | option | Number of words per training batch. |
 | `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. |