mirror of https://github.com/explosion/spaCy.git
Update pretrain command
parent 73255091f8
commit 008e1ee1dd
@@ -28,8 +28,9 @@ from spacy.tokens import Doc
 from spacy.attrs import ID, HEAD
 from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
 from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
-from thinc.v2v import Affine
+from thinc.v2v import Affine, Maxout
 from thinc.api import wrap
+from thinc.misc import LayerNorm as LN
 
 
 def prefer_gpu():
@@ -120,7 +121,10 @@ def create_pretraining_model(nlp, tok2vec):
     Each array in the output needs to have one row per token in the doc.
     '''
     output_size = nlp.vocab.vectors.data.shape[1]
-    output_layer = zero_init(Affine(output_size, drop_factor=0.0))
+    output_layer = chain(
+        LN(Maxout(300, pieces=3)),
+        zero_init(Affine(output_size, drop_factor=0.0))
+    )
     # This is annoying, but the parser etc have the flatten step after
     # the tok2vec. To load the weights in cleanly, we need to match
     # the shape of the models' components exactly. So what we cann
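
The output head grows from a single zero-initialised Affine projection into a layer-normalised Maxout block feeding that projection. As a rough illustration of what the new hidden layer computes, here is a plain-numpy sketch of a maxout unit followed by layer normalisation; the shapes, the random initialisation and the absence of learned norm parameters are simplifications for illustration, not thinc's actual LN(Maxout(...)) implementation.

    import numpy

    def maxout_layernorm(X, W, b, eps=1e-6):
        # X: (n_tokens, n_in); W: (n_out, pieces, n_in); b: (n_out, pieces)
        candidates = numpy.einsum('ni,opi->nop', X, W) + b  # one activation per piece
        hidden = candidates.max(axis=-1)                    # maxout keeps the best piece
        mu = hidden.mean(axis=-1, keepdims=True)
        sigma = hidden.std(axis=-1, keepdims=True)
        return (hidden - mu) / (sigma + eps)                # normalise each token's row

    X = numpy.random.uniform(-1, 1, (5, 96)).astype('f')          # e.g. five width-96 tok2vec rows
    W = numpy.random.uniform(-0.1, 0.1, (300, 3, 96)).astype('f')
    b = numpy.zeros((300, 3), dtype='f')
    print(maxout_layernorm(X, W, b).shape)                  # -> (5, 300)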
@@ -139,16 +143,10 @@ def create_pretraining_model(nlp, tok2vec):
 
 def masked_language_model(vocab, model, mask_prob=0.15):
     '''Convert a model into a BERT-style masked language model'''
-    vocab_words = [lex.text for lex in vocab if lex.prob != 0.0]
-    vocab_probs = [lex.prob for lex in vocab if lex.prob != 0.0]
-    vocab_words = vocab_words[:10000]
-    vocab_probs = vocab_probs[:10000]
-    vocab_probs = numpy.exp(numpy.array(vocab_probs, dtype='f'))
-    vocab_probs /= vocab_probs.sum()
+    random_words = RandomWords(vocab)
 
     def mlm_forward(docs, drop=0.):
-        mask, docs = apply_mask(docs, vocab_words, vocab_probs,
-                                mask_prob=mask_prob)
+        mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
         mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
         output, backprop = model.begin_update(docs, drop=drop)
 
@@ -161,7 +159,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
     return wrap(mlm_forward, model)
 
 
-def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
+def apply_mask(docs, random_words, mask_prob=0.15):
     N = sum(len(doc) for doc in docs)
     mask = numpy.random.uniform(0., 1.0, (N,))
     mask = mask >= mask_prob
@@ -171,7 +169,7 @@ def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
         words = []
         for token in doc:
             if not mask[i]:
-                word = replace_word(token.text, vocab_texts, vocab_probs)
+                word = replace_word(token.text, random_words)
             else:
                 word = token.text
             words.append(word)
@@ -186,19 +184,35 @@ def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15):
     return mask, masked_docs
 
 
-def replace_word(word, vocab_texts, vocab_probs, mask='[MASK]'):
+def replace_word(word, random_words, mask='[MASK]'):
     roll = random.random()
     if roll < 0.8:
         return mask
     elif roll < 0.9:
-        index = numpy.random.choice(len(vocab_texts), p=vocab_probs)
-        return vocab_texts[index]
+        return random_words.next()
     else:
         return word
 
 
+class RandomWords(object):
+    def __init__(self, vocab):
+        self.words = [lex.text for lex in vocab if lex.prob != 0.0]
+        self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
+        self.words = self.words[:10000]
+        self.probs = self.probs[:10000]
+        self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
+        self.probs /= self.probs.sum()
+        self._cache = []
+
+    def next(self):
+        if not self._cache:
+            self._cache.extend(numpy.random.choice(len(self.words), 10000,
+                                                   p=self.probs))
+        index = self._cache.pop()
+        return self.words[index]
+
+
 class ProgressTracker(object):
-    def __init__(self, frequency=100000):
+    def __init__(self, frequency=1000000):
         self.loss = 0.0
         self.prev_loss = 0.0
         self.nr_word = 0
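
The frequent-word sampling that used to live inline in masked_language_model is now encapsulated in RandomWords: it keeps up to 10,000 lexemes with a non-zero probability, turns their log-probabilities into a normalised distribution with numpy.exp, and draws replacements in cached batches of 10,000 so numpy.random.choice is not called once per token. The corruption rule in replace_word stays BERT-style: of the tokens selected for masking, about 80% become '[MASK]', 10% are swapped for a random frequent word, and 10% are left unchanged. A small self-contained sketch of that split, using a stand-in word list rather than a real vocab:

    import random
    from collections import Counter

    def corrupt(word, random_words, mask='[MASK]'):
        # Mirrors the thresholds in replace_word: 80% mask, 10% random word, 10% keep.
        roll = random.random()
        if roll < 0.8:
            return mask
        elif roll < 0.9:
            return random.choice(random_words)
        else:
            return word

    stand_in_words = ['the', 'of', 'and', 'to', 'a']    # stand-in for RandomWords
    counts = Counter(corrupt('village', stand_in_words) for _ in range(10000))
    print(counts['[MASK]'], counts['village'])          # roughly 8000 and 1000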
@@ -206,9 +220,11 @@ class ProgressTracker(object):
         self.frequency = frequency
         self.last_time = time.time()
         self.last_update = 0
+        self.epoch_loss = 0.0
 
     def update(self, epoch, loss, docs):
         self.loss += loss
+        self.epoch_loss += loss
         words_in_batch = sum(len(doc) for doc in docs)
         self.words_per_epoch[epoch] += words_in_batch
         self.nr_word += words_in_batch
@@ -243,8 +259,8 @@ class ProgressTracker(object):
     seed=("Seed for random number generators", "option", "s", float),
     nr_iter=("Number of iterations to pretrain", "option", "i", int),
 )
-def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
-             embed_rows=5000, use_vectors=False, dropout=0.2, nr_iter=100, seed=0):
+def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
+             embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load
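
The defaults shift from width=128, embed_rows=5000, nr_iter=100 to width=96, embed_rows=2000, nr_iter=1000: a narrower tok2vec with a smaller embedding table, trained for more passes over the text. A Python-level call with the new defaults spelled out might look like the sketch below; the paths and vectors package name are placeholders, and the expected format of texts_loc is whatever load_texts/stream_texts elsewhere in this file accept.

    # Hypothetical invocation; '/path/to/texts', 'en_vectors_web_lg' and
    # 'pretrain-output' are placeholders, not part of this commit.
    pretrain('/path/to/texts', 'en_vectors_web_lg', 'pretrain-output',
             width=96, depth=4, embed_rows=2000, use_vectors=False,
             dropout=0.2, nr_iter=1000, seed=0)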
@@ -284,8 +300,8 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
     print('Epoch', '#Words', 'Loss', 'w/s')
     texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
     for epoch in range(nr_iter):
-        for batch in minibatch(texts, size=256):
-            docs = make_docs(nlp, batch)
+        for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
+            docs = make_docs(nlp, [text for (text, _) in batch])
             loss = make_update(model, docs, optimizer, drop=dropout)
             progress = tracker.update(epoch, loss, docs)
             if progress:
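
Batching now goes by word count rather than a fixed number of texts, so a few very long documents no longer inflate a batch the way a fixed size=256 text count could; the texts are wrapped as (text, None) tuples to match the input format minibatch_by_words expects. Conceptually the batching behaves something like the simplified generator below, which is a stand-in to show the idea, not spaCy's actual minibatch_by_words implementation:

    def batch_by_words(texts, size=5000):
        # Simplified stand-in: group texts so each batch holds at most roughly
        # `size` whitespace-delimited words (an over-long text gets its own batch).
        batch, n_words = [], 0
        for text in texts:
            n = len(text.split())
            if batch and n_words + n > size:
                yield batch
                batch, n_words = [], 0
            batch.append(text)
            n_words += n
        if batch:
            yield batch

    for batch in batch_by_words(['one two three', 'four five', 'six'], size=4):
        print(batch)    # -> ['one two three'] then ['four five', 'six']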
@@ -297,6 +313,8 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
                 file_.write(model.tok2vec.to_bytes())
             with (output_dir / 'log.jsonl').open('a') as file_:
                 file_.write(json.dumps({'nr_word': tracker.nr_word,
-                                        'loss': tracker.loss, 'epoch': epoch}) + '\n')
+                                        'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
+                                        'epoch': epoch}) + '\n')
+        tracker.epoch_loss = 0.0
         if texts_loc != '-':
             texts = load_texts(texts_loc)
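
The log now records a per-epoch loss alongside the running total, and tracker.epoch_loss is reset once each epoch's entry has been written. Because every line of log.jsonl is an independent JSON object, the log can be inspected with a few lines of Python; the output directory name below is just a placeholder:

    import json
    from pathlib import Path

    log_path = Path('pretrain-output') / 'log.jsonl'    # placeholder for output_dir
    for line in log_path.open():
        entry = json.loads(line)
        print(entry['epoch'], entry['nr_word'], entry['epoch_loss'], entry['loss'])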