diff --git a/examples/experiments/onto-ner.cfg b/examples/experiments/onto-ner.cfg
index 48fe25a67..8970bb3c0 100644
--- a/examples/experiments/onto-ner.cfg
+++ b/examples/experiments/onto-ner.cfg
@@ -9,12 +9,12 @@ max_length = 5000
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-dropout = 0.2
+dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
-patience = 1600
+patience = 100000
 max_epochs = 0
-max_steps = 20000
-eval_frequency = 500
+max_steps = 100000
+eval_frequency = 2000
 # Other settings
 seed = 0
 accumulate_gradient = 1
@@ -30,25 +30,25 @@ omit_extra_lookups = false
 [training.batch_size]
 @schedules = "compounding.v1"
 start = 100
-stop = 1000
+stop = 2000
 compound = 1.001
 
 [training.optimizer]
 @optimizers = "Adam.v1"
 beta1 = 0.9
 beta2 = 0.999
-L2_is_weight_decay = false
-L2 = 1e-6
+L2_is_weight_decay = true
+L2 = 0.0
 grad_clip = 1.0
 use_averages = true
 eps = 1e-8
 learn_rate = 0.001
 
-#[optimizer.learn_rate]
+#[training.optimizer.learn_rate]
 #@schedules = "warmup_linear.v1"
-#warmup_steps = 250
-#total_steps = 20000
-#initial_rate = 0.001
+#warmup_steps = 1000
+#total_steps = 50000
+#initial_rate = 0.003
 
 [nlp]
 lang = "en"
@@ -58,23 +58,21 @@ vectors = null
 factory = "ner"
 learn_tokens = false
 min_action_freq = 1
-beam_width = 1
-beam_update_prob = 1.0
 
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
 nr_feature_tokens = 3
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
+use_upper = false
 
 [nlp.pipeline.ner.model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"
 pretrained_vectors = ${nlp:vectors}
-width = 96
+width = 300
 depth = 4
 window_size = 1
-embed_size = 2000
-maxout_pieces = 3
+embed_size = 7000
+maxout_pieces = 1
 subword_features = true
 dropout = ${training:dropout}