2020-09-28 10:31:54 +00:00
|
|
|
# Shared path variables; other sections pull these in via ${paths.<key>} interpolation.
[paths]
|
|
|
|
# Left as null in this file. It is consumed as ${paths.raw_text} by the `path`
# key of [corpora.pretrain], so a real location presumably has to be supplied
# from outside before that reader can load anything — TODO confirm.
raw_text = null
|
|
|
|
|
2020-08-24 13:56:03 +00:00
|
|
|
# Top-level pretraining settings. The batcher, objective and optimizer are
# configured in the [pretraining.batcher], [pretraining.objective] and
# [pretraining.optimizer] sub-sections of this file.
[pretraining]
|
|
|
|
# NOTE(review): presumably the maximum number of pretraining epochs — confirm.
max_epochs = 1000
|
|
|
|
# NOTE(review): presumably a dropout rate in [0, 1] — confirm.
dropout = 0.2
|
|
|
|
# null means no value is set here; the effect of leaving it unset is decided
# by the consumer — TODO confirm.
n_save_every = null
|
2020-09-14 23:12:02 +00:00
|
|
|
# NOTE(review): presumably the name of the pipeline component to pretrain — confirm.
component = "tok2vec"
|
|
|
|
# Empty string, i.e. no layer name given. NOTE(review): presumably selects a
# sub-layer of `component` when non-empty — confirm.
layer = ""
|
2020-09-17 09:38:59 +00:00
|
|
|
# Dotted name matching the [corpora.pretrain] section defined later in this file.
corpus = "corpora.pretrain"
|
2020-09-14 23:12:02 +00:00
|
|
|
|
|
|
|
# Batching settings for pretraining. The "@batchers" key names a registered
# function; the remaining keys look like that function's arguments
# (thinc-style registry convention — TODO confirm against the consumer).
[pretraining.batcher]
|
|
|
|
@batchers = "spacy.batch_by_words.v1"
|
|
|
|
# NOTE(review): given the "batch_by_words" name, presumably a target number
# of words per batch — confirm.
size = 3000
|
|
|
|
discard_oversize = false
|
|
|
|
# NOTE(review): presumably a fractional allowance relative to `size` — confirm.
tolerance = 0.2
|
|
|
|
# null means no custom length function is supplied here.
get_length = null
|
|
|
|
|
2020-08-24 13:56:03 +00:00
|
|
|
# Pretraining objective. The "@architectures" key names a registered
# function; the remaining keys look like that function's arguments
# (thinc-style registry convention — TODO confirm against the consumer).
[pretraining.objective]
|
2020-12-08 06:41:03 +00:00
|
|
|
@architectures = "spacy.PretrainCharacters.v1"
|
|
|
|
maxout_pieces = 3
|
|
|
|
hidden_size = 300
|
2020-08-24 13:56:03 +00:00
|
|
|
# NOTE(review): given the "PretrainCharacters" name, presumably a
# per-token character count used by the objective — confirm.
n_characters = 4
|
|
|
|
|
|
|
|
# Optimizer used during pretraining. The "@optimizers" key names a registered
# function ("Adam.v1"); the remaining keys look like that function's arguments
# (thinc-style registry convention — TODO confirm against the consumer).
[pretraining.optimizer]
|
|
|
|
@optimizers = "Adam.v1"
|
|
|
|
beta1 = 0.9
|
|
|
|
beta2 = 0.999
|
|
|
|
# NOTE(review): presumably switches the L2 value below between weight-decay
# and plain L2-penalty interpretation — confirm.
L2_is_weight_decay = true
|
|
|
|
L2 = 0.01
|
|
|
|
# NOTE(review): presumably a gradient-clipping threshold — confirm.
grad_clip = 1.0
|
|
|
|
use_averages = true
|
|
|
|
eps = 1e-8
|
|
|
|
learn_rate = 0.001
|
2020-09-17 09:38:59 +00:00
|
|
|
|
|
|
|
# Parent section for corpus definitions. It has no keys of its own here; the
# only child section visible in this file is [corpora.pretrain].
[corpora]
|
|
|
|
|
|
|
|
# Corpus referenced by `corpus = "corpora.pretrain"` in [pretraining]. The
# "@readers" key names a registered function; the remaining keys look like
# that function's arguments (thinc-style registry convention — TODO confirm).
[corpora.pretrain]
|
2020-10-01 23:36:06 +00:00
|
|
|
@readers = "spacy.JsonlCorpus.v1"
|
2020-09-28 10:05:23 +00:00
|
|
|
# Interpolated from `raw_text` in [paths], which is null in this file.
path = ${paths.raw_text}
|
2020-09-17 09:38:59 +00:00
|
|
|
# NOTE(review): min_length / max_length presumably bound the length of the
# documents that are kept; unit (tokens vs. characters) not shown here — confirm.
min_length = 5
|
|
|
|
max_length = 500
|
|
|
|
# NOTE(review): 0 presumably means "no limit" on the number of examples — confirm.
limit = 0
|