diff --git a/requirements.txt b/requirements.txt
index e2a1860ea..e095d04fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ cython>=0.25
 numpy>=1.15.0
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc==7.0.0.dev0
+thinc==7.0.0.dev1
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0
diff --git a/setup.py b/setup.py
index 0bf48f709..3c7bd51d8 100755
--- a/setup.py
+++ b/setup.py
@@ -200,7 +200,7 @@ def setup_package():
         "murmurhash>=0.28.0,<1.1.0",
         "cymem>=2.0.2,<2.1.0",
         "preshed>=2.0.1,<2.1.0",
-        "thinc==7.0.0.dev0",
+        "thinc==7.0.0.dev1",
         "blis>=0.2.2,<0.3.0",
         "plac<1.0.0,>=0.9.6",
         "ujson>=1.35",
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 679b1aef6..0cfdec7e9 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -48,11 +48,11 @@ def cosine(vec1, vec2):
 
 def create_default_optimizer(ops, **cfg):
     learn_rate = util.env_opt('learn_rate', 0.001)
-    beta1 = util.env_opt('optimizer_B1', 0.9)
-    beta2 = util.env_opt('optimizer_B2', 0.9)
-    eps = util.env_opt('optimizer_eps', 1e-12)
+    beta1 = util.env_opt('optimizer_B1', 0.8)
+    beta2 = util.env_opt('optimizer_B2', 0.8)
+    eps = util.env_opt('optimizer_eps', 0.00001)
     L2 = util.env_opt('L2_penalty', 1e-6)
-    max_grad_norm = util.env_opt('grad_norm_clip', 1.)
+    max_grad_norm = util.env_opt('grad_norm_clip', 5.)
     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1,
                      beta2=beta2, eps=eps)
     optimizer.max_grad_norm = max_grad_norm
@@ -445,11 +445,11 @@ def getitem(i):
 
 
 def build_tagger_model(nr_class, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
+    embed_size = util.env_opt('embed_size', 2000)
     if 'token_vector_width' in cfg:
         token_vector_width = cfg['token_vector_width']
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
     pretrained_vectors = cfg.get('pretrained_vectors')
     subword_features = cfg.get('subword_features', True)
     with Model.define_operators({'>>': chain, '+': add}):
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index f46d41452..44f3f3174 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -24,10 +24,12 @@ import sys
 from collections import Counter
 
 import spacy
-from spacy.attrs import ID
+from spacy.tokens import Doc
+from spacy.attrs import ID, HEAD
 from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
 from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
 from thinc.v2v import Affine
+from thinc.api import wrap
 
 
 def prefer_gpu():
@@ -47,13 +49,14 @@ def load_texts(path):
     '''
     path = ensure_path(path)
     with path.open('r', encoding='utf8') as file_:
-        texts = [json.loads(line)['text'] for line in file_]
+        texts = [json.loads(line) for line in file_]
     random.shuffle(texts)
     return texts
 
+
 def stream_texts():
     for line in sys.stdin:
-        yield json.loads(line)['text']
+        yield json.loads(line)
 
 
 def make_update(model, docs, optimizer, drop=0.):
@@ -65,11 +68,33 @@ def make_update(model, docs, optimizer, drop=0.):
     RETURNS loss: A float for the loss.
     """
     predictions, backprop = model.begin_update(docs, drop=drop)
-    loss, gradients = get_vectors_loss(model.ops, docs, predictions)
+    gradients = get_vectors_loss(model.ops, docs, predictions)
     backprop(gradients, sgd=optimizer)
+    # Don't want to return a cupy object here
+    # The gradients are modified in-place by the BERT MLM,
+    # so we get an accurate loss
+    loss = float((gradients**2).mean())
     return loss
 
 
+def make_docs(nlp, batch):
+    docs = []
+    for record in batch:
+        text = record["text"]
+        if "tokens" in record:
+            doc = Doc(nlp.vocab, words=record["tokens"])
+        else:
+            doc = nlp.make_doc(text)
+        if "heads" in record:
+            heads = record["heads"]
+            heads = numpy.asarray(heads, dtype="uint64")
+            heads = heads.reshape((len(doc), 1))
+            doc = doc.from_array([HEAD], heads)
+        if len(doc) >= 1 and len(doc) < 200:
+            docs.append(doc)
+    return docs
+
+
 def get_vectors_loss(ops, docs, prediction):
     """Compute a mean-squared error loss between the documents' vectors and
     the prediction.
@@ -84,10 +109,8 @@
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
-    d_scores = (prediction - target) / prediction.shape[0]
-    # Don't want to return a cupy object here
-    loss = float((d_scores**2).sum())
-    return loss, d_scores
+    d_scores = prediction - target
+    return d_scores
 
 
 def create_pretraining_model(nlp, tok2vec):
@@ -107,15 +130,77 @@ def create_pretraining_model(nlp, tok2vec):
         tok2vec,
         output_layer
     )
+    model = masked_language_model(nlp.vocab, model)
     model.tok2vec = tok2vec
     model.output_layer = output_layer
     model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
     return model
 
 
+def masked_language_model(vocab, model, mask_prob=0.15):
+    '''Convert a model into a BERT-style masked language model'''
+    vocab_words = [lex.text for lex in vocab if lex.prob != 0.0]
+    vocab_probs = [lex.prob for lex in vocab if lex.prob != 0.0]
+    vocab_words = vocab_words[:10000]
+    vocab_probs = vocab_probs[:10000]
+    vocab_probs = numpy.exp(numpy.array(vocab_probs, dtype='f'))
+    vocab_probs /= vocab_probs.sum()
+
+    def mlm_forward(docs, drop=0.):
+        mask, docs = apply_mask(docs, vocab_words, vocab_probs,
+                                mask_prob=mask_prob)
+        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
+        output, backprop = model.begin_update(docs, drop=drop)
+
+        def mlm_backward(d_output, sgd=None):
+            d_output *= 1-mask
+            return backprop(d_output, sgd=sgd)
+
+        return output, mlm_backward
+
+    return wrap(mlm_forward, model)
+
+
+def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
+    N = sum(len(doc) for doc in docs)
+    mask = numpy.random.uniform(0., 1.0, (N,))
+    mask = mask >= mask_prob
+    i = 0
+    masked_docs = []
+    for doc in docs:
+        words = []
+        for token in doc:
+            if not mask[i]:
+                word = replace_word(token.text, vocab_texts, vocab_probs)
+            else:
+                word = token.text
+            words.append(word)
+            i += 1
+        spaces = [bool(w.whitespace_) for w in doc]
+        # NB: If you change this implementation to instead modify
+        # the docs in place, take care that the IDs reflect the original
+        # words. Currently we use the original docs to make the vectors
+        # for the target, so we don't lose the original tokens. But if
+        # you modified the docs in place here, you would.
+        masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
+    return mask, masked_docs
+
+
+def replace_word(word, vocab_texts, vocab_probs, mask='[MASK]'):
+    roll = random.random()
+    if roll < 0.8:
+        return mask
+    elif roll < 0.9:
+        index = numpy.random.choice(len(vocab_texts), p=vocab_probs)
+        return vocab_texts[index]
+    else:
+        return word
+
+
 class ProgressTracker(object):
     def __init__(self, frequency=100000):
-        self.loss = 0.
+        self.loss = 0.0
+        self.prev_loss = 0.0
         self.nr_word = 0
         self.words_per_epoch = Counter()
         self.frequency = frequency
@@ -132,7 +217,15 @@ class ProgressTracker(object):
             wps = words_since_update / (time.time() - self.last_time)
             self.last_update = self.nr_word
             self.last_time = time.time()
-            status = (epoch, self.nr_word, '%.5f' % self.loss, int(wps))
+            loss_per_word = self.loss - self.prev_loss
+            status = (
+                epoch,
+                self.nr_word,
+                "%.5f" % self.loss,
+                "%.4f" % loss_per_word,
+                int(wps),
+            )
+            self.prev_loss = float(self.loss)
             return status
         else:
             return None
@@ -145,12 +238,13 @@ class ProgressTracker(object):
     width=("Width of CNN layers", "option", "cw", int),
     depth=("Depth of CNN layers", "option", "cd", int),
     embed_rows=("Embedding rows", "option", "er", int),
+    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
    dropout=("Dropout", "option", "d", float),
    seed=("Seed for random number generators", "option", "s", float),
    nr_iter=("Number of iterations to pretrain", "option", "i", int),
 )
 def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
-             embed_rows=1000, dropout=0.2, nr_iter=10, seed=0):
+             embed_rows=5000, use_vectors=False, dropout=0.2, nr_iter=100, seed=0):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load
@@ -175,11 +269,13 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
     with (output_dir / 'config.json').open('w') as file_:
         file_.write(json.dumps(config))
     has_gpu = prefer_gpu()
+    print("Use GPU?", has_gpu)
     nlp = spacy.load(vectors_model)
+    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
     model = create_pretraining_model(nlp,
                 Tok2Vec(width, embed_rows,
                         conv_depth=depth,
-                        pretrained_vectors=nlp.vocab.vectors.name,
+                        pretrained_vectors=pretrained_vectors,
                         bilstm_depth=0, # Requires PyTorch. Experimental.
                         cnn_maxout_pieces=2, # You can try setting this higher
                         subword_features=True)) # Set to False for character models, e.g. Chinese
@@ -188,19 +284,19 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
     print('Epoch', '#Words', 'Loss', 'w/s')
     texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
     for epoch in range(nr_iter):
-        for batch in minibatch(texts, size=64):
-            docs = [nlp.make_doc(text) for text in batch]
+        for batch in minibatch(texts, size=256):
+            docs = make_docs(nlp, batch)
             loss = make_update(model, docs, optimizer, drop=dropout)
             progress = tracker.update(epoch, loss, docs)
             if progress:
                 print(*progress)
-                if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**6:
+                if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
                     break
         with model.use_params(optimizer.averages):
            with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
                file_.write(model.tok2vec.to_bytes())
            with (output_dir / 'log.jsonl').open('a') as file_:
                file_.write(json.dumps({'nr_word': tracker.nr_word,
-                                       'loss': tracker.loss, 'epoch': epoch}))
+                                       'loss': tracker.loss, 'epoch': epoch}) + '\n')
         if texts_loc != '-':
             texts = load_texts(texts_loc)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 01aebfae8..01c8cb199 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -90,11 +90,11 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     # starts high and decays sharply, to force the optimizer to explore.
     # Batch size starts at 1 and grows, so that we make updates quickly
     # at the beginning of training.
-    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
-                                  util.env_opt('dropout_to', 0.2),
+    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
+                                  util.env_opt('dropout_to', 0.1),
                                   util.env_opt('dropout_decay', 0.0))
-    batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
-                                   util.env_opt('batch_to', 1000),
+    batch_sizes = util.compounding(util.env_opt('batch_from', 750),
+                                   util.env_opt('batch_to', 750),
                                    util.env_opt('batch_compound', 1.001))
     lang_class = util.get_lang_class(lang)
     nlp = lang_class()
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 20a319f5d..63d8e0733 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -25,6 +25,7 @@ from .compat import json_dumps
 
 from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
 
+
 def tags_to_entities(tags):
     entities = []
     start = None
@@ -110,19 +111,23 @@ class GoldCorpus(object):
         # Write temp directory with one doc per file, so we can shuffle
         # and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
-        self.write_msgpack(self.tmp_dir / 'train', train)
-        self.write_msgpack(self.tmp_dir / 'dev', dev)
+        self.write_msgpack(self.tmp_dir / 'train', train, limit=self.limit)
+        self.write_msgpack(self.tmp_dir / 'dev', dev, limit=self.limit)
 
     def __del__(self):
         shutil.rmtree(self.tmp_dir)
 
     @staticmethod
-    def write_msgpack(directory, doc_tuples):
+    def write_msgpack(directory, doc_tuples, limit=0):
         if not directory.exists():
             directory.mkdir()
+        n = 0
         for i, doc_tuple in enumerate(doc_tuples):
             with open(directory / '{}.msg'.format(i), 'wb') as file_:
-                msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
+                msgpack.dump([doc_tuple], file_, use_bin_type=True)
+            n += len(doc_tuple[1])
+            if limit and n >= limit:
+                break
 
     @staticmethod
     def walk_corpus(path):
@@ -153,7 +158,7 @@ class GoldCorpus(object):
                 gold_tuples = read_json_file(loc)
             elif loc.parts[-1].endswith('msg'):
                 with loc.open('rb') as file_:
-                    gold_tuples = msgpack.load(file_, encoding='utf8')
+                    gold_tuples = msgpack.load(file_, raw=False)
             else:
                 msg = "Cannot read from file: %s. Supported formats: .json, .msg"
                 raise ValueError(msg % loc)
@@ -350,7 +355,7 @@ def _json_iterate(loc):
                 py_str = py_raw[start : i+1].decode('utf8')
                 try:
                     yield json.loads(py_str)
-                except:
+                except Exception:
                     print(py_str)
                     raise
                 start = -1
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index f7c4ec4e0..e2a244080 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -759,7 +759,7 @@ class Tagger(Pipe):
             if self.model is True:
                 token_vector_width = util.env_opt(
                     'token_vector_width',
-                    self.cfg.get('token_vector_width', 128))
+                    self.cfg.get('token_vector_width', 96))
                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
             self.model.from_bytes(b)
 
@@ -878,7 +878,7 @@ class MultitaskObjective(Tagger):
 
     @classmethod
     def Model(cls, n_tags, tok2vec=None, **cfg):
-        token_vector_width = util.env_opt('token_vector_width', 128)
+        token_vector_width = util.env_opt('token_vector_width', 96)
         softmax = Softmax(n_tags, token_vector_width)
         model = chain(
             tok2vec,
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 82e87ae61..0cecdb93b 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -63,9 +63,9 @@ cdef class Parser:
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
-                                          cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 5000))
+                                          cfg.get('token_vector_width', 96))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           conv_depth=conv_depth,
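Note (not part of the patch): the new `apply_mask`/`replace_word` helpers in `spacy/cli/pretrain.py` implement BERT-style corruption, and `masked_language_model` zeroes the gradient at uncorrupted positions (`d_output *= 1-mask`), so only corrupted tokens drive the update. Below is a minimal, self-contained sketch of just the corruption step on a plain word list; the function name `corrupt_tokens` and the toy vocabulary are illustrative only, not part of the patch.

import random
import numpy

def corrupt_tokens(words, vocab_words, vocab_probs, mask_prob=0.15, mask_token='[MASK]'):
    # Mirrors apply_mask/replace_word: each token is kept with probability
    # (1 - mask_prob); a corrupted token becomes the mask token 80% of the
    # time, a frequency-sampled vocabulary word 10% of the time, and stays
    # unchanged the remaining 10%.
    keep = numpy.random.uniform(0.0, 1.0, (len(words),)) >= mask_prob
    corrupted = []
    for word, keep_word in zip(words, keep):
        if keep_word:
            corrupted.append(word)
            continue
        roll = random.random()
        if roll < 0.8:
            corrupted.append(mask_token)
        elif roll < 0.9:
            index = numpy.random.choice(len(vocab_words), p=vocab_probs)
            corrupted.append(vocab_words[index])
        else:
            corrupted.append(word)
    return keep, corrupted

# Toy vocabulary and unigram probabilities (illustrative only; must sum to 1).
vocab_words = ['the', 'a', 'dog', 'cat', 'sat']
vocab_probs = numpy.array([0.4, 0.3, 0.1, 0.1, 0.1])
keep, corrupted = corrupt_tokens('the cat sat on the mat'.split(), vocab_words, vocab_probs)
print(list(keep), corrupted)

In the patch itself, the returned keep-mask plays the role of `mask` in `mlm_backward`: gradients at kept positions are multiplied by zero, so the loss that `make_update` now computes from the in-place-modified gradients reflects only the predictions at corrupted tokens.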