From e0f9f448f1305e382c5e7042d8bbac882fea9644 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 1 Jun 2020 23:38:48 +0200 Subject: [PATCH 01/13] remove Tensorizer --- examples/training/pretrain_textcat.py | 212 ------------------ spacy/language.py | 4 - spacy/ml/models/__init__.py | 1 - spacy/ml/models/tensorizer.py | 10 - spacy/pipeline/__init__.py | 3 +- spacy/pipeline/defaults/__init__.py | 10 - .../pipeline/defaults/tensorizer_defaults.cfg | 4 - spacy/pipeline/hooks.py | 6 +- spacy/pipeline/pipes.pyx | 136 +---------- .../serialize/test_serialize_pipeline.py | 22 +- 10 files changed, 8 insertions(+), 400 deletions(-) delete mode 100644 examples/training/pretrain_textcat.py delete mode 100644 spacy/ml/models/tensorizer.py delete mode 100644 spacy/pipeline/defaults/tensorizer_defaults.cfg diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py deleted file mode 100644 index 5c41c0e92..000000000 --- a/examples/training/pretrain_textcat.py +++ /dev/null @@ -1,212 +0,0 @@ -"""This script is experimental. - -Try pre-training the CNN component of the text categorizer using a cheap -language modelling-like objective. Specifically, we load pretrained vectors -(from something like word2vec, GloVe, FastText etc), and use the CNN to -predict the tokens' pretrained vectors. This isn't as easy as it sounds: -we're not merely doing compression here, because heavy dropout is applied, -including over the input words. This means the model must often (50% of the time) -use the context in order to predict the word. - -To evaluate the technique, we're pre-training with the 50k texts from the IMDB -corpus, and then training with only 100 labels. Note that it's a bit dirty to -pre-train with the development data, but also not *so* terrible: we're not using -the development labels, after all --- only the unlabelled text. 
-""" -import plac -import tqdm -import random - -import ml_datasets - -import spacy -from spacy.util import minibatch -from spacy.pipeline import TextCategorizer -from spacy.ml.models.tok2vec import build_Tok2Vec_model -import numpy - - -def load_texts(limit=0): - train, dev = ml_datasets.imdb() - train_texts, train_labels = zip(*train) - dev_texts, dev_labels = zip(*train) - train_texts = list(train_texts) - dev_texts = list(dev_texts) - random.shuffle(train_texts) - random.shuffle(dev_texts) - if limit >= 1: - return train_texts[:limit] - else: - return list(train_texts) + list(dev_texts) - - -def load_textcat_data(limit=0): - """Load data from the IMDB dataset.""" - # Partition off part of the train data for evaluation - train_data, eval_data = ml_datasets.imdb() - random.shuffle(train_data) - train_data = train_data[-limit:] - texts, labels = zip(*train_data) - eval_texts, eval_labels = zip(*eval_data) - cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels] - eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels] - return (texts, cats), (eval_texts, eval_cats) - - -def prefer_gpu(): - used = spacy.util.use_gpu(0) - if used is None: - return False - else: - import cupy.random - - cupy.random.seed(0) - return True - - -def build_textcat_model(tok2vec, nr_class, width): - from thinc.api import Model, Softmax, chain, reduce_mean, list2ragged - - with Model.define_operators({">>": chain}): - model = ( - tok2vec - >> list2ragged() - >> reduce_mean() - >> Softmax(nr_class, width) - ) - model.set_ref("tok2vec", tok2vec) - return model - - -def block_gradients(model): - from thinc.api import wrap # TODO FIX - - def forward(X, drop=0.0): - Y, _ = model.begin_update(X, drop=drop) - return Y, None - - return wrap(forward, model) - - -def create_pipeline(width, embed_size, vectors_model): - print("Load vectors") - nlp = spacy.load(vectors_model) - print("Start training") - textcat = TextCategorizer( - nlp.vocab, - labels=["POSITIVE", "NEGATIVE"], - # TODO: replace with config version - model=build_textcat_model( - build_Tok2Vec_model(width=width, embed_size=embed_size), 2, width - ), - ) - - nlp.add_pipe(textcat) - return nlp - - -def train_tensorizer(nlp, texts, dropout, n_iter): - tensorizer = nlp.create_pipe("tensorizer") - nlp.add_pipe(tensorizer) - optimizer = nlp.begin_training() - for i in range(n_iter): - losses = {} - for i, batch in enumerate(minibatch(tqdm.tqdm(texts))): - docs = [nlp.make_doc(text) for text in batch] - tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout) - print(losses) - return optimizer - - -def train_textcat(nlp, n_texts, n_iter=10): - textcat = nlp.get_pipe("textcat") - tok2vec_weights = textcat.model.get_ref("tok2vec").to_bytes() - (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts) - print( - "Using {} examples ({} training, {} evaluation)".format( - n_texts, len(train_texts), len(dev_texts) - ) - ) - train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) - - with nlp.select_pipes(enable="textcat"): # only train textcat - optimizer = nlp.begin_training() - textcat.model.get_ref("tok2vec").from_bytes(tok2vec_weights) - print("Training the model...") - print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) - for i in range(n_iter): - losses = {"textcat": 0.0} - # batch up the examples using spaCy's minibatch - batches = minibatch(tqdm.tqdm(train_data), size=2) - for batch in batches: - nlp.update(batch, sgd=optimizer, drop=0.2, 
losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats) - print( - "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table - losses["textcat"], - scores["textcat_p"], - scores["textcat_r"], - scores["textcat_f"], - ) - ) - - -def evaluate_textcat(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 1e-8 - fp = 1e-8 - tn = 1e-8 - fn = 1e-8 - for i, doc in enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1.0 - elif score >= 0.5 and gold[label] < 0.5: - fp += 1.0 - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 and gold[label] >= 0.5: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - f_score = 2 * (precision * recall) / (precision + recall) - return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score} - - -@plac.annotations( - width=("Width of CNN layers", "positional", None, int), - embed_size=("Embedding rows", "positional", None, int), - pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), - train_examples=("Number of labelled examples", "option", "eg", int), - vectors_model=("Name or path to vectors model to learn from"), -) -def main( - width, - embed_size, - vectors_model, - pretrain_iters=30, - train_iters=30, - train_examples=1000, -): - random.seed(0) - numpy.random.seed(0) - use_gpu = prefer_gpu() - print("Using GPU?", use_gpu) - - nlp = create_pipeline(width, embed_size, vectors_model) - print("Load data") - texts = load_texts(limit=0) - print("Train tensorizer") - optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters) - print("Train textcat") - train_textcat(nlp, train_examples, n_iter=train_iters) - - -if __name__ == "__main__": - plac.call(main) diff --git a/spacy/language.py b/spacy/language.py index 61d69b63e..22360c65f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -225,10 +225,6 @@ class Language(object): # Conveniences to access pipeline components # Shouldn't be used anymore! - @property - def tensorizer(self): - return self.get_pipe("tensorizer") - @property def tagger(self): return self.get_pipe("tagger") diff --git a/spacy/ml/models/__init__.py b/spacy/ml/models/__init__.py index ef1e8efca..40cde2437 100644 --- a/spacy/ml/models/__init__.py +++ b/spacy/ml/models/__init__.py @@ -2,6 +2,5 @@ from .entity_linker import * # noqa from .parser import * # noqa from .simple_ner import * from .tagger import * # noqa -from .tensorizer import * # noqa from .textcat import * # noqa from .tok2vec import * # noqa diff --git a/spacy/ml/models/tensorizer.py b/spacy/ml/models/tensorizer.py deleted file mode 100644 index f66610b64..000000000 --- a/spacy/ml/models/tensorizer.py +++ /dev/null @@ -1,10 +0,0 @@ -from thinc.api import Linear, zero_init - -from ... 
import util -from ...util import registry - - -@registry.architectures.register("spacy.Tensorizer.v1") -def build_tensorizer(input_size, output_size): - input_size = util.env_opt("token_vector_width", input_size) - return Linear(output_size, input_size, init_W=zero_init) diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index b2866bad2..116a08e92 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,5 +1,5 @@ from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker -from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer +from .pipes import TextCategorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer from .simple_ner import SimpleNER from .morphologizer import Morphologizer @@ -14,7 +14,6 @@ __all__ = [ "EntityRecognizer", "EntityLinker", "TextCategorizer", - "Tensorizer", "Tok2Vec", "Pipe", "Morphologizer", diff --git a/spacy/pipeline/defaults/__init__.py b/spacy/pipeline/defaults/__init__.py index e17e2d3b4..483c6bbd6 100644 --- a/spacy/pipeline/defaults/__init__.py +++ b/spacy/pipeline/defaults/__init__.py @@ -63,16 +63,6 @@ def default_tagger(): return util.load_config(loc, create_objects=True)["model"] -def default_tensorizer_config(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=False) - - -def default_tensorizer(): - loc = Path(__file__).parent / "tensorizer_defaults.cfg" - return util.load_config(loc, create_objects=True)["model"] - - def default_textcat_config(): loc = Path(__file__).parent / "textcat_defaults.cfg" return util.load_config(loc, create_objects=False) diff --git a/spacy/pipeline/defaults/tensorizer_defaults.cfg b/spacy/pipeline/defaults/tensorizer_defaults.cfg deleted file mode 100644 index 81880a109..000000000 --- a/spacy/pipeline/defaults/tensorizer_defaults.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[model] -@architectures = "spacy.Tensorizer.v1" -input_size=96 -output_size=300 diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 351323ae9..a97e7be68 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -44,8 +44,8 @@ class SentenceSegmenter(object): class SimilarityHook(Pipe): """ Experimental: A pipeline component to install a hook for supervised - similarity into `Doc` objects. Requires a `Tensorizer` to pre-process - documents. The similarity model can be any object obeying the Thinc `Model` + similarity into `Doc` objects. + The similarity model can be any object obeying the Thinc `Model` interface. By default, the model concatenates the elementwise mean and elementwise max of the two tensors, and compares them using the Cauchy-like similarity function from Chen (2013): @@ -82,7 +82,7 @@ class SimilarityHook(Pipe): sims, bp_sims = self.model.begin_update(doc1_doc2) def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): - """Allocate model, using width from tensorizer in pipeline. + """Allocate model, using nO from the first model in the pipeline. gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. 
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f75ed1659..cfe01981e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -16,7 +16,7 @@ from ..morphology cimport Morphology from ..vocab cimport Vocab from .defaults import default_tagger, default_parser, default_ner, default_textcat -from .defaults import default_nel, default_senter, default_tensorizer +from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj @@ -238,138 +238,6 @@ class Pipe(object): return self -@component("tensorizer", assigns=["doc.tensor"], default_model=default_tensorizer) -class Tensorizer(Pipe): - """Pre-train position-sensitive vectors for tokens.""" - - def __init__(self, vocab, model, **cfg): - """Construct a new statistical model. Weights are not allocated on - initialisation. - - vocab (Vocab): A `Vocab` instance. The model must share the same - `Vocab` instance with the `Doc` objects it will process. - **cfg: Config parameters. - """ - self.vocab = vocab - self.model = model - self.input_models = [] - self.cfg = dict(cfg) - - def __call__(self, example): - """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM - model. Vectors are set to the `Doc.tensor` attribute. - - docs (Doc or iterable): One or more documents to add vectors to. - RETURNS (dict or None): Intermediate computations. - """ - doc = self._get_doc(example) - tokvecses = self.predict([doc]) - self.set_annotations([doc], tokvecses) - if isinstance(example, Example): - example.doc = doc - return example - return doc - - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - """Process `Doc` objects as a stream. - - stream (iterator): A sequence of `Doc` or `Example` objects to process. - batch_size (int): Number of `Doc` or `Example` objects to group. - YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input. - """ - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] - tensors = self.predict(docs) - self.set_annotations(docs, tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs - - def predict(self, docs): - """Return a single tensor for a batch of documents. - - docs (iterable): A sequence of `Doc` objects. - RETURNS (object): Vector representations for each token in the docs. - """ - inputs = self.model.ops.flatten([doc.tensor for doc in docs]) - outputs = self.model(inputs) - return self.model.ops.unflatten(outputs, [len(d) for d in docs]) - - def set_annotations(self, docs, tensors): - """Set the tensor attribute for a batch of documents. - - docs (iterable): A sequence of `Doc` objects. - tensors (object): Vector representation for each token in the docs. - """ - for doc, tensor in zip(docs, tensors): - if tensor.shape[0] != len(doc): - raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) - doc.tensor = tensor - - def update(self, examples, state=None, drop=0.0, set_annotations=False, sgd=None, losses=None): - """Update the model. - - docs (iterable): A batch of `Doc` objects. - golds (iterable): A batch of `GoldParse` objects. - drop (float): The dropout rate. - sgd (callable): An optimizer. - RETURNS (dict): Results from the update. 
- """ - examples = Example.to_example_objects(examples) - inputs = [] - bp_inputs = [] - set_dropout_rate(self.model, drop) - for tok2vec in self.input_models: - set_dropout_rate(tok2vec, drop) - tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples]) - inputs.append(tensor) - bp_inputs.append(bp_tensor) - inputs = self.model.ops.xp.hstack(inputs) - scores, bp_scores = self.model.begin_update(inputs) - loss, d_scores = self.get_loss(examples, scores) - d_inputs = bp_scores(d_scores, sgd=sgd) - d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) - for d_input, bp_input in zip(d_inputs, bp_inputs): - bp_input(d_input) - if sgd is not None: - for tok2vec in self.input_models: - tok2vec.finish_update(sgd) - self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - return loss - - def get_loss(self, examples, prediction): - examples = Example.to_example_objects(examples) - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) - target = self.vocab.vectors.data[ids] - d_scores = (prediction - target) / prediction.shape[0] - loss = (d_scores ** 2).sum() - return loss, d_scores - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - """Allocate models, pre-process training data and acquire an - optimizer. - - get_examples (iterable): Gold-standard training data. - pipeline (list): The pipeline the model is part of. - """ - if pipeline is not None: - for name, model in pipeline: - if model.has_ref("tok2vec"): - self.input_models.append(model.get_ref("tok2vec")) - self.model.initialize() - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - @component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. 
@@ -1707,4 +1575,4 @@ def ner_factory(nlp, model, **cfg): warnings.warn(Warnings.W098.format(name="ner")) return EntityRecognizer.from_nlp(nlp, model, **cfg) -__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"] diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 4fc277c4f..595a35a9f 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,7 +1,7 @@ import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer -from spacy.pipeline.defaults import default_parser, default_tensorizer, default_tagger +from spacy.pipeline import TextCategorizer, SentenceRecognizer +from spacy.pipeline.defaults import default_parser, default_tagger from spacy.pipeline.defaults import default_textcat, default_senter from ..util import make_tempdir @@ -95,24 +95,6 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): assert tagger1_d.to_bytes() == tagger2_d.to_bytes() -def test_serialize_tensorizer_roundtrip_bytes(en_vocab): - tensorizer = Tensorizer(en_vocab, default_tensorizer()) - tensorizer_b = tensorizer.to_bytes(exclude=["vocab"]) - new_tensorizer = Tensorizer(en_vocab, default_tensorizer()).from_bytes(tensorizer_b) - assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b - - -def test_serialize_tensorizer_roundtrip_disk(en_vocab): - tensorizer = Tensorizer(en_vocab, default_tensorizer()) - with make_tempdir() as d: - file_path = d / "tensorizer" - tensorizer.to_disk(file_path) - tensorizer_d = Tensorizer(en_vocab, default_tensorizer()).from_disk(file_path) - assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes( - exclude=["vocab"] - ) - - def test_serialize_textcat_empty(en_vocab): # See issue #1105 textcat = TextCategorizer( From 03c58b488c2a28d70995447fba5ab6610520d970 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 10:00:21 +0200 Subject: [PATCH 02/13] prevent infinite loop, custom warning --- spacy/cli/train_from_config.py | 27 ++++++++++++++++++--------- spacy/errors.py | 2 ++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index c0e3bd169..852f456de 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -13,6 +13,7 @@ import random from ..gold import GoldCorpus from .. import util from ..errors import Errors +from ..ml import models # don't remove - required to load the built-in architectures registry = util.registry @@ -75,7 +76,7 @@ maxout_pieces = 3 subword_features = true """ - +# TODO: REMOVE ? class PipelineComponent(BaseModel): factory: str model: Model @@ -83,7 +84,7 @@ class PipelineComponent(BaseModel): class Config: arbitrary_types_allowed = True - +# TODO: REMOVE ? class ConfigSchema(BaseModel): optimizer: Optional["Optimizer"] @@ -123,7 +124,7 @@ class ConfigSchema(BaseModel): use_gpu=("Use GPU", "option", "g", int), # fmt: on ) -def train_from_config_cli( +def train_cli( train_path, dev_path, config_path, @@ -132,7 +133,7 @@ def train_from_config_cli( raw_text=None, debug=False, verbose=False, - use_gpu=-1 + use_gpu=-1, ): """ Train or update a spaCy model. 
Requires data to be formatted in spaCy's @@ -156,7 +157,7 @@ def train_from_config_cli( else: msg.info("Using CPU") - train_from_config( + train( config_path, {"train": train_path, "dev": dev_path}, output_path=output_path, @@ -165,10 +166,11 @@ def train_from_config_cli( ) -def train_from_config( +def train( config_path, data_paths, raw_text=None, meta_path=None, output_path=None, ): msg.info(f"Loading config from: {config_path}") + # Read the config first without creating objects, to get to the original nlp_config config = util.load_config(config_path, create_objects=False) util.fix_random_seed(config["training"]["seed"]) if config["training"]["use_pytorch_for_gpu_memory"]: @@ -177,8 +179,8 @@ def train_from_config( config = util.load_config(config_path, create_objects=True) msg.info("Creating nlp from config") nlp = util.load_model_from_config(nlp_config) - optimizer = config["optimizer"] training = config["training"] + optimizer = training["optimizer"] limit = training["limit"] msg.info("Loading training corpus") corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit) @@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg): if len(train_examples) == 0: raise ValueError(Errors.E988) random.shuffle(train_examples) - batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"]) + batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"]) + # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop + try: + first = next(batches) + yield first + except StopIteration: + raise ValueError(Errors.E986) for batch in batches: yield batch epochs_todo -= 1 # We intentionally compare exactly to 0 here, so that max_epochs < 1 # will not break. - if epochs_todo == 0: + if epochs_todo == 0: break @@ -366,6 +374,7 @@ def train_while_improving( # Stop if we've exhausted our max steps (if specified) if max_steps and (step * accumulate_gradient) >= max_steps: break + step += 1 def subdivide_batch(batch, accumulate_gradient): diff --git a/spacy/errors.py b/spacy/errors.py index da2cfdf04..852c55225 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -554,6 +554,8 @@ class Errors(object): # TODO: fix numbering after merging develop into master + E986 = ("Could not create any training batches: check your input. " + "Perhaps discard_oversize should be set to False ?") E987 = ("The text of an example training instance is either a Doc or " "a string, but found {type} instead.") E988 = ("Could not parse any training examples. 
Ensure the data is " From e91485dfc464744d1c2d1ea9e648efeea9e403a1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 10:04:16 +0200 Subject: [PATCH 03/13] add discard_oversize parameter, move optimizer to training subsection --- examples/experiments/onto-joint/defaults.cfg | 3 ++- examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg | 3 ++- examples/experiments/ptb-joint-pos-dep/defaults.cfg | 3 ++- examples/experiments/tok2vec-ner/charembed_tok2vec.cfg | 3 ++- .../experiments/tok2vec-ner/multihashembed_tok2vec.cfg | 3 ++- spacy/__main__.py | 7 +++---- spacy/cli/__init__.py | 3 +-- spacy/cli/train_from_config.py | 1 - spacy/ml/__init__.py | 1 + 9 files changed, 15 insertions(+), 12 deletions(-) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index fbac4ea7d..0fdbc5cf5 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -25,6 +25,7 @@ score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} # These settings are invalid for the transformer models. init_tok2vec = null vectors = null +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -32,7 +33,7 @@ start = 1000 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" beta1 = 0.9 beta2 = 0.999 diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index e152fa5e0..fdd4139f8 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -21,7 +22,7 @@ start = 100 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 9a10c45f0..5b369d782 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -14,6 +14,7 @@ score_weights = {"las": 0.8, "tags_acc": 0.2} limit = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -21,7 +22,7 @@ start = 100 stop = 1000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg index 796c8670f..8e5c3a276 100644 --- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -12,8 +12,9 @@ max_length = 0 batch_size = 25 seed = 0 accumulate_gradient = 2 +discard_oversize = false -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index 3ac70675b..149b8ea66 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -11,6 +11,7 @@ gold_preproc = true max_length = 0 seed = 0 accumulate_gradient = 2 +discard_oversize = false [training.batch_size] @schedules = "compounding.v1" @@ -19,7 +20,7 @@ stop = 
3000 compound = 1.001 -[optimizer] +[training.optimizer] @optimizers = "Adam.v1" learn_rate = 0.001 beta1 = 0.9 diff --git a/spacy/__main__.py b/spacy/__main__.py index 71ab1a91a..beed3170d 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -2,16 +2,15 @@ if __name__ == "__main__": import plac import sys from wasabi import msg - from spacy.cli import download, link, info, package, train, pretrain, convert + from spacy.cli import download, link, info, package, pretrain, convert from spacy.cli import init_model, profile, evaluate, validate, debug_data - from spacy.cli import train_from_config_cli + from spacy.cli import train_cli commands = { "download": download, "link": link, "info": info, - "train": train, - "train-from-config": train_from_config_cli, + "train": train_cli, "pretrain": pretrain, "debug-data": debug_data, "evaluate": evaluate, diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5f83b26c1..2ffbe2d0c 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,8 +4,7 @@ from .download import download # noqa: F401 from .info import info # noqa: F401 from .package import package # noqa: F401 from .profile import profile # noqa: F401 -from .train import train # noqa: F401 -from .train_from_config import train_from_config_cli # noqa: F401 +from .train_from_config import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 from .evaluate import evaluate # noqa: F401 diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 852f456de..9cdc3bf2f 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -374,7 +374,6 @@ def train_while_improving( # Stop if we've exhausted our max steps (if specified) if max_steps and (step * accumulate_gradient) >= max_steps: break - step += 1 def subdivide_batch(batch, accumulate_gradient): diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index e69de29bb..cf4f59d6c 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -0,0 +1 @@ +from .models import * \ No newline at end of file From eac12cbb773912d274a2e4eb5090b8fe89992ef4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 11:50:16 +0200 Subject: [PATCH 04/13] make dropout in embed layers configurable --- spacy/ml/models/textcat.py | 17 ++++---- spacy/ml/models/tok2vec.py | 40 ++++++++++--------- .../defaults/entity_linker_defaults.cfg | 1 + .../defaults/morphologizer_defaults.cfg | 1 + spacy/pipeline/defaults/ner_defaults.cfg | 1 + spacy/pipeline/defaults/parser_defaults.cfg | 1 + spacy/pipeline/defaults/senter_defaults.cfg | 1 + .../pipeline/defaults/simple_ner_defaults.cfg | 1 + spacy/pipeline/defaults/tagger_defaults.cfg | 1 + .../defaults/textcat_cnn_defaults.cfg | 1 + spacy/pipeline/defaults/textcat_defaults.cfg | 1 + spacy/pipeline/defaults/tok2vec_defaults.cfg | 1 + spacy/tests/pipeline/test_textcat.py | 6 +-- .../tests/serialize/test_serialize_config.py | 3 ++ spacy/tests/test_tok2vec.py | 19 ++++----- 15 files changed, 57 insertions(+), 38 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ce31d058c..141c66f79 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, nO=None): + window_size, 
conv_depth, dropout, nO=None): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER)) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE)) + lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) + prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class @registry.architectures.register("spacy.TextCatLowData.v1") -def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): +def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): nlp = util.load_model(pretrained_vectors) vectors = nlp.vocab.vectors vector_dim = vectors.data.shape[1] @@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): >> reduce_sum() >> residual(Relu(width, width)) ** 2 >> Linear(nO, width) - >> Dropout(0.0) - >> Logistic() ) + if dropout: + model = model >> Dropout(dropout) + model = model >> Logistic() return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index a2e8f589a..53798e57c 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -49,6 +49,7 @@ def hash_embed_cnn( maxout_pieces, window_size, subword_features, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -63,6 +64,7 @@ def hash_embed_cnn( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @@ -76,6 +78,7 @@ def hash_charembed_cnn( window_size, nM, nC, + dropout, ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -90,12 +93,13 @@ def hash_charembed_cnn( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces + pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -110,12 +114,13 @@ def hash_embed_bilstm_v1( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC + pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces): @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): - norm = HashEmbed(nO=width, nV=rows, 
column=columns.index("NORM")) +def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX")) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX")) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE")) + prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) + suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) + shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) if pretrained_vectors: glove = StaticVectors( vectors=pretrained_vectors.data, nO=width, column=columns.index(ID), - dropout=0.0, + dropout=dropout, ) with Model.define_operators({">>": chain, "|": concatenate}): @@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): embed_layer = norm else: if use_subwords and pretrained_vectors: - nr_columns = 5 concat_columns = glove | norm | prefix | suffix | shape elif use_subwords: - nr_columns = 4 concat_columns = norm | prefix | suffix | shape else: - nr_columns = 2 concat_columns = glove | norm embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) @@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(columns, width, rows, nM, nC, features): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) with Model.define_operators({">>": chain, "|": concatenate}): embed_layer = chr_embed | features >> with_array(norm) @@ -238,16 +241,17 @@ def build_Tok2Vec_model( nC, conv_depth, bilstm_depth, + dropout, ) -> Model: if char_embed: subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM)) + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE)) + prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: @@ -255,7 +259,7 @@ def build_Tok2Vec_model( vectors=pretrained_vectors.data, nO=width, column=cols.index(ID), - dropout=0.0, + dropout=dropout, ) if subword_features: diff --git a/spacy/pipeline/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg index 6a591ec3e..26a294f37 100644 --- a/spacy/pipeline/defaults/entity_linker_defaults.cfg +++ b/spacy/pipeline/defaults/entity_linker_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 300 window_size = 1 maxout_pieces = 
3 subword_features = true +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg index 150eca507..c4452c689 100644 --- a/spacy/pipeline/defaults/morphologizer_defaults.cfg +++ b/spacy/pipeline/defaults/morphologizer_defaults.cfg @@ -11,3 +11,4 @@ window_size = 1 maxout_pieces = 3 nM = 64 nC = 8 +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/ner_defaults.cfg b/spacy/pipeline/defaults/ner_defaults.cfg index db2c131f5..eb926c43b 100644 --- a/spacy/pipeline/defaults/ner_defaults.cfg +++ b/spacy/pipeline/defaults/ner_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/parser_defaults.cfg b/spacy/pipeline/defaults/parser_defaults.cfg index 9cbb6eadb..6fe0fd7cb 100644 --- a/spacy/pipeline/defaults/parser_defaults.cfg +++ b/spacy/pipeline/defaults/parser_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/senter_defaults.cfg b/spacy/pipeline/defaults/senter_defaults.cfg index ffa2c6ce2..304e42b01 100644 --- a/spacy/pipeline/defaults/senter_defaults.cfg +++ b/spacy/pipeline/defaults/senter_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 2 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/simple_ner_defaults.cfg b/spacy/pipeline/defaults/simple_ner_defaults.cfg index 4e3b640df..7f206a636 100644 --- a/spacy/pipeline/defaults/simple_ner_defaults.cfg +++ b/spacy/pipeline/defaults/simple_ner_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 7000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/tagger_defaults.cfg b/spacy/pipeline/defaults/tagger_defaults.cfg index 5aea80a32..f26c5f099 100644 --- a/spacy/pipeline/defaults/tagger_defaults.cfg +++ b/spacy/pipeline/defaults/tagger_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg index cea1bfe54..91f3a1742 100644 --- a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg @@ -11,3 +11,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg index 9477b2995..e5817de4a 100644 --- a/spacy/pipeline/defaults/textcat_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_defaults.cfg @@ -7,3 +7,4 @@ conv_depth = 2 embed_size = 2000 window_size = 1 ngram_size = 1 +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg index 9475d4aab..36bf0c3da 100644 --- a/spacy/pipeline/defaults/tok2vec_defaults.cfg +++ b/spacy/pipeline/defaults/tok2vec_defaults.cfg @@ -7,3 +7,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null \ No newline at end of file diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 725a4fd69..179659597 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -123,9 +123,9 @@ def test_overfitting_IO(): 
{"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False}, ], diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ba63adfa4..870a980f2 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -24,6 +24,7 @@ window_size = 1 embed_size = 2000 maxout_pieces = 3 subword_features = true +dropout = null [nlp.pipeline.tagger] factory = "tagger" @@ -53,6 +54,7 @@ embed_size = 5555 window_size = 1 maxout_pieces = 7 subword_features = false +dropout = null """ @@ -70,6 +72,7 @@ def my_parser(): nC=8, conv_depth=2, bilstm_depth=0, + dropout=None, ) parser = build_tb_parser_model( tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 9c2e9004b..ee1f9dead 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -15,7 +15,7 @@ def test_empty_doc(): vocab = Vocab() doc = Doc(vocab, words=[]) # TODO: fix tok2vec arguments - tok2vec = build_Tok2Vec_model(width, embed_size) + tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None) vectors, backprop = tok2vec.begin_update([doc]) assert len(vectors) == 1 assert vectors[0].shape == (0, width) @@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): char_embed=False, nM=64, nC=8, + dropout=None, ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) @@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): @pytest.mark.parametrize( "tok2vec_config", [ - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, 
"window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, ], ) # fmt: on From 109bbdab98735def2f106d113094fc880d2b2382 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 11:53:59 +0200 Subject: [PATCH 05/13] update config files with separate dropout for Tok2Vec layer --- examples/experiments/onto-joint/defaults.cfg | 1 + examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg | 1 + examples/experiments/ptb-joint-pos-dep/defaults.cfg | 1 + examples/experiments/tok2vec-ner/charembed_tok2vec.cfg | 1 + 
examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg | 1 + 5 files changed, 5 insertions(+) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index 0fdbc5cf5..6c3a21f4b 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -114,3 +114,4 @@ window_size = 1 embed_size = 10000 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index fdd4139f8..52faad9ec 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -66,3 +66,4 @@ depth = 4 embed_size = 2000 subword_features = true maxout_pieces = 3 +dropout = null \ No newline at end of file diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 5b369d782..c305c015c 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -67,3 +67,4 @@ window_size = 1 embed_size = 2000 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg index 8e5c3a276..eca6a22fa 100644 --- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg @@ -37,6 +37,7 @@ nM = 64 nC = 8 rows = 2000 columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] +dropout = null [nlp.pipeline.tok2vec.model.extract.features] @architectures = "spacy.Doc2Feats.v1" diff --git a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg index 149b8ea66..a5fa32b18 100644 --- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg +++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg @@ -45,3 +45,4 @@ maxout_pieces = 3 window_size = 1 subword_features = true pretrained_vectors = null +dropout = null From ffe0451d0972ec209556dc7aad356deca1cbe0a7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 14:45:00 +0200 Subject: [PATCH 06/13] pretrain from config --- examples/experiments/onto-joint/pretrain.cfg | 144 +++++++++++++++ spacy/_ml.py | 0 spacy/cli/pretrain.py | 179 +++++++------------ spacy/errors.py | 2 - spacy/ml/models/multi_task.py | 84 ++++++++- 5 files changed, 286 insertions(+), 123 deletions(-) create mode 100644 examples/experiments/onto-joint/pretrain.cfg delete mode 100644 spacy/_ml.py diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg new file mode 100644 index 000000000..6a41cc677 --- /dev/null +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -0,0 +1,144 @@ +# Training hyper-parameters and additional features. +[training] +# Whether to train on sequences with 'gold standard' sentence boundaries +# and tokens. If you set this to true, take care to ensure your run-time +# data is passed in sentence-by-sentence via some prior preprocessing. +gold_preproc = false +# Limitations on training document length or number of examples. +max_length = 0 +limit = 0 +# Data augmentation +orth_variant_level = 0.0 +dropout = 0.1 +# Controls early-stopping. 0 or -1 mean unlimited. 
+patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 400 +# Other settings +seed = 0 +accumulate_gradient = 1 +use_pytorch_for_gpu_memory = false +# Control how scores are printed and checkpoints are evaluated. +scores = ["speed", "tags_acc", "uas", "las", "ents_f"] +score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} +# These settings are invalid for the transformer models. +init_tok2vec = null +vectors = null +discard_oversize = false + +[training.batch_size] +@schedules = "compounding.v1" +start = 1000 +stop = 1000 +compound = 1.001 + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[pretraining] +max_epochs = 100 +min_length = 5 +max_length = 500 +dropout = 0.2 +n_save_every = null +batch_size = 3000 + +[pretraining.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true +dropout = null + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 0.001 + +[pretraining.loss_func] +@losses = "CosineDistance.v1" + +[nlp] +lang = "en" +vectors = ${training:vectors} + +[nlp.pipeline.tok2vec] +factory = "tok2vec" + +[nlp.pipeline.senter] +factory = "senter" + +[nlp.pipeline.ner] +factory = "ner" + +[nlp.pipeline.tagger] +factory = "tagger" + +[nlp.pipeline.parser] +factory = "parser" + +[nlp.pipeline.senter.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.senter.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tagger.model] +@architectures = "spacy.Tagger.v1" + +[nlp.pipeline.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.parser.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 8 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.parser.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy.Tok2VecTensors.v1" +width = ${nlp.pipeline.tok2vec.model:width} + +[nlp.pipeline.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = ${nlp:vectors} +width = 256 +depth = 6 +window_size = 1 +embed_size = 10000 +maxout_pieces = 3 +subword_features = true +dropout = null diff --git a/spacy/_ml.py b/spacy/_ml.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index b2e3229ee..0022a0d07 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -3,48 +3,36 @@ import numpy import time import re from collections import Counter +import plac from pathlib import Path -from thinc.api import Linear, Maxout, chain, list2array, prefer_gpu -from thinc.api import CosineDistance, L2Distance +from thinc.api import Linear, Maxout, chain, list2array from wasabi import msg import srsly +from thinc.api import use_pytorch_for_gpu_memory -from ..gold import Example from ..errors import Errors from ..ml.models.multi_task import 
build_masked_language_model from ..tokens import Doc from ..attrs import ID, HEAD -from ..ml.models.tok2vec import build_Tok2Vec_model from .. import util -from ..util import create_default_optimizer -from .train import _load_pretrained_tok2vec +from ..gold import Example -def pretrain( +@plac.annotations( # fmt: off - texts_loc: ("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), - vectors_model: ("Name or path to spaCy model with vectors to learn from", "positional", None, str), - output_dir: ("Directory to write models to on each epoch", "positional", None, str), - width: ("Width of CNN layers", "option", "cw", int) = 96, - conv_depth: ("Depth of CNN layers", "option", "cd", int) = 4, - bilstm_depth: ("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int) = 0, - cnn_pieces: ("Maxout size for CNN layers. 1 for Mish", "option", "cP", int) = 3, - sa_depth: ("Depth of self-attention layers", "option", "sa", int) = 0, - use_chars: ("Whether to use character-based embedding", "flag", "chr", bool) = False, - cnn_window: ("Window size for CNN layers", "option", "cW", int) = 1, - embed_rows: ("Number of embedding rows", "option", "er", int) = 2000, - loss_func: ("Loss function to use for the objective. Either 'L2' or 'cosine'", "option", "L", str) = "cosine", - use_vectors: ("Whether to use the static vectors as input features", "flag", "uv") = False, - dropout: ("Dropout rate", "option", "d", float) = 0.2, - n_iter: ("Number of iterations to pretrain", "option", "i", int) = 1000, - batch_size: ("Number of words per training batch", "option", "bs", int) = 3000, - max_length: ("Max words per example. Longer examples are discarded", "option", "xw", int) = 500, - min_length: ("Min words per example. Shorter examples are discarded", "option", "nw", int) = 5, - seed: ("Seed for random number generators", "option", "s", int) = 0, - n_save_every: ("Save model every X batches.", "option", "se", int) = None, - init_tok2vec: ("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, - epoch_start: ("The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.", "option", "es", int) = None, + texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), + vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), + config_path=("Path to config file", "positional", None, Path), + output_dir=("Directory to write models to on each epoch", "positional", None, Path), + use_gpu=("Use GPU", "option", "g", int), # fmt: on +) +def pretrain( + texts_loc, + vectors_model, + config_path, + output_dir, + use_gpu=-1, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -58,23 +46,24 @@ def pretrain( However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure - all settings are the same between pretraining and training. The API and - errors around this need some improvement. + all settings are the same between pretraining and training. Ideally, + this is done by using the same config file for both commands. 
""" - config = dict(locals()) - for key in config: - if isinstance(config[key], Path): - config[key] = str(config[key]) - util.fix_random_seed(seed) + if not config_path or not config_path.exists(): + msg.fail("Config file not found", config_path, exits=1) - has_gpu = prefer_gpu() - if has_gpu: - import torch + if use_gpu >= 0: + msg.info("Using GPU") + util.use_gpu(use_gpu) + else: + msg.info("Using CPU") - torch.set_default_tensor_type("torch.cuda.FloatTensor") - msg.info("Using GPU" if has_gpu else "Not using GPU") + msg.info(f"Loading config from: {config_path}") + config = util.load_config(config_path, create_objects=False) + util.fix_random_seed(config["training"]["seed"]) + if config["training"]["use_pytorch_for_gpu_memory"]: + use_pytorch_for_gpu_memory() - output_dir = Path(output_dir) if output_dir.exists() and [p for p in output_dir.iterdir()]: msg.warn( "Output directory is not empty", @@ -85,7 +74,10 @@ def pretrain( output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") srsly.write_json(output_dir / "config.json", config) - msg.good("Saved settings to config.json") + msg.good("Saved config file in the output directory") + + config = util.load_config(config_path, create_objects=True) + pretrain_config = config["pretraining"] # Load texts from file or stdin if texts_loc != "-": # reading from a file @@ -105,49 +97,11 @@ def pretrain( with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) msg.good(f"Loaded model '{vectors_model}'") - pretrained_vectors = None if not use_vectors else nlp.vocab.vectors - model = create_pretraining_model( - nlp, - # TODO: replace with config - build_Tok2Vec_model( - width, - embed_rows, - conv_depth=conv_depth, - pretrained_vectors=pretrained_vectors, - bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. - subword_features=not use_chars, # Set to False for Chinese etc - maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. - window_size=1, - char_embed=False, - nM=64, - nC=8, - ), - ) - # Load in pretrained weights - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") - # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) - if model_name: - # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_start = int(model_name.group(0)[5:][:-4]) + 1 - else: - if not epoch_start: - msg.fail( - "You have to use the --epoch-start argument when using a renamed weight file for --init-tok2vec", - exits=True, - ) - elif epoch_start < 0: - msg.fail( - f"The argument --epoch-start has to be greater or equal to 0. 
{epoch_start} is invalid", - exits=True, - ) - else: - # Without '--init-tok2vec' the '--epoch-start' argument is ignored - epoch_start = 0 + tok2vec = pretrain_config["model"] + model = create_pretraining_model(nlp, tok2vec) + optimizer = pretrain_config["optimizer"] - optimizer = create_default_optimizer() + epoch_start = 0 # TODO tracker = ProgressTracker(frequency=10000) msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} @@ -168,28 +122,25 @@ def pretrain( file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 - for epoch in range(epoch_start, n_iter + epoch_start): - for batch_id, batch in enumerate( - util.minibatch_by_words( - (Example(doc=text) for text in texts), size=batch_size - ) - ): + loss_func = pretrain_config["loss_func"] + for epoch in range(epoch_start, pretrain_config["max_epochs"]): + examples = [Example(doc=text) for text in texts] + batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"]) + for batch_id, batch in enumerate(batches): docs, count = make_docs( nlp, - [text for (text, _) in batch], - max_length=max_length, - min_length=min_length, + [ex.doc for ex in batch], + max_length=pretrain_config["max_length"], + min_length=pretrain_config["min_length"], ) skip_counter += count - loss = make_update( - model, docs, optimizer, objective=loss_func, drop=dropout - ) + loss = make_update(model, docs, optimizer, distance=loss_func) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break - if n_save_every and (batch_id % n_save_every == 0): + if pretrain_config["n_save_every"] and (batch_id % pretrain_config["n_save_every"] == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 @@ -201,17 +152,17 @@ def pretrain( msg.good("Successfully finished pretrain") -def make_update(model, docs, optimizer, drop=0.0, objective="L2"): +def make_update(model, docs, optimizer, distance): """Perform an update over a single batch of documents. docs (iterable): A batch of `Doc` objects. - drop (float): The dropout rate. optimizer (callable): An optimizer. RETURNS loss: A float for the loss. """ - predictions, backprop = model.begin_update(docs, drop=drop) - loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective) - backprop(gradients, sgd=optimizer) + predictions, backprop = model.begin_update(docs) + loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance) + backprop(gradients) + model.finish_update(optimizer) # Don't want to return a cupy object here # The gradients are modified in-place by the BERT MLM, # so we get an accurate loss @@ -243,12 +194,12 @@ def make_docs(nlp, batch, min_length, max_length): heads = numpy.asarray(heads, dtype="uint64") heads = heads.reshape((len(doc), 1)) doc = doc.from_array([HEAD], heads) - if len(doc) >= min_length and len(doc) < max_length: + if min_length <= len(doc) < max_length: docs.append(doc) return docs, skip_count -def get_vectors_loss(ops, docs, prediction, objective="L2"): +def get_vectors_loss(ops, docs, prediction, distance): """Compute a mean-squared error loss between the documents' vectors and the prediction. @@ -262,13 +213,6 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): # and look them up all at once. This prevents data copying. 
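(An aside on the loss refactor above, illustration only and not part of the diff: after this patch the distance metric is injected from the [pretraining.loss_func] config block instead of being selected by the old `objective` string inside get_vectors_loss(). A minimal sketch of that calling convention on dummy arrays; the shapes and values below are invented purely for illustration:

    import numpy
    from thinc.api import CosineDistance

    # Stand-ins for the tok2vec model's predicted vectors and the tokens'
    # pretrained vectors looked up from the vocab (4 tokens, 300 dims).
    prediction = numpy.random.uniform(-1.0, 1.0, (4, 300)).astype("float32")
    target = numpy.random.uniform(-1.0, 1.0, (4, 300)).astype("float32")

    # The "CosineDistance.v1" entry in the config builds essentially this
    # loss object. Calling it returns the gradient with respect to the
    # prediction plus a scalar loss, which is how get_vectors_loss()
    # consumes it after this patch.
    distance = CosineDistance(normalize=True)
    d_prediction, loss = distance(prediction, target)
    print(float(loss), d_prediction.shape)

A later patch in this series adds `normalize = true` to the CosineDistance.v1 config block, matching the `normalize=True` used in this sketch.)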
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] - # TODO: this code originally didn't normalize, but shouldn't normalize=True ? - if objective == "L2": - distance = L2Distance(normalize=False) - elif objective == "cosine": - distance = CosineDistance(normalize=False) - else: - raise ValueError(Errors.E142.format(loss_func=objective)) d_target, loss = distance(prediction, target) return loss, d_target @@ -281,7 +225,7 @@ def create_pretraining_model(nlp, tok2vec): """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - Maxout(300, pieces=3, normalize=True, dropout=0.0), Linear(output_size) + Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match @@ -289,11 +233,12 @@ def create_pretraining_model(nlp, tok2vec): # "tok2vec" has to be the same set of processes as what the components do. tok2vec = chain(tok2vec, list2array()) model = chain(tok2vec, output_layer) - model = build_masked_language_model(nlp.vocab, model) - model.set_ref("tok2vec", tok2vec) - model.set_ref("output_layer", output_layer) model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) - return model + mlm_model = build_masked_language_model(nlp.vocab, model) + mlm_model.set_ref("tok2vec", tok2vec) + mlm_model.set_ref("output_layer", output_layer) + mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")]) + return mlm_model class ProgressTracker(object): diff --git a/spacy/errors.py b/spacy/errors.py index 852c55225..96b323ef5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -441,8 +441,6 @@ class Errors(object): "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") - E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or " - "'cosine'.") E143 = ("Labels for component '{name}' not initialized. 
Did you forget to " "call add_label()?") E144 = ("Could not find parameter `{param}` when building the entity " diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 1c193df82..970d31899 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,4 +1,6 @@ -from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init +import numpy + +from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model def build_multi_task_model(n_tags, tok2vec=None, token_vector_width=96): @@ -24,6 +26,80 @@ def build_cloze_multi_task_model(vocab, tok2vec): return model -def build_masked_language_model(*args, **kwargs): - # TODO cf https://github.com/explosion/spaCy/blob/2c107f02a4d60bda2440db0aad1a88cbbf4fb52d/spacy/_ml.py#L828 - raise NotImplementedError +def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): + """Convert a model into a BERT-style masked language model""" + + random_words = _RandomWords(vocab) + + def mlm_forward(model, docs, is_train): + mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) + mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) + output, backprop = model.get_ref("wrapped-model").begin_update(docs) # drop=drop + + def mlm_backward(d_output): + d_output *= 1 - mask + return backprop(d_output) + + return output, mlm_backward + + mlm_model = Model("masked-language-model", mlm_forward, layers=[wrapped_model]) + mlm_model.set_ref("wrapped-model", wrapped_model) + + return mlm_model + + +class _RandomWords(object): + def __init__(self, vocab): + self.words = [lex.text for lex in vocab if lex.prob != 0.0] + self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] + self.words = self.words[:10000] + self.probs = self.probs[:10000] + self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) + self.probs /= self.probs.sum() + self._cache = [] + + def next(self): + if not self._cache: + self._cache.extend( + numpy.random.choice(len(self.words), 10000, p=self.probs) + ) + index = self._cache.pop() + return self.words[index] + + +def _apply_mask(docs, random_words, mask_prob=0.15): + # This needs to be here to avoid circular imports + from ...tokens import Doc + + N = sum(len(doc) for doc in docs) + mask = numpy.random.uniform(0.0, 1.0, (N,)) + mask = mask >= mask_prob + i = 0 + masked_docs = [] + for doc in docs: + words = [] + for token in doc: + if not mask[i]: + word = _replace_word(token.text, random_words) + else: + word = token.text + words.append(word) + i += 1 + spaces = [bool(w.whitespace_) for w in doc] + # NB: If you change this implementation to instead modify + # the docs in place, take care that the IDs reflect the original + # words. Currently we use the original docs to make the vectors + # for the target, so we don't lose the original tokens. But if + # you modified the docs in place here, you would. 
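(An aside on _apply_mask() above, illustration only and not part of the diff: the corruption rule is easier to see on plain strings than on Doc objects. The token list below is invented for the sketch; the real helper rebuilds spaCy Doc objects and samples replacement words from the vocab by frequency via _RandomWords:

    import numpy

    def toy_apply_mask(words, mask_prob=0.15):
        # True = keep the token, False = corrupt it, mirroring
        # `mask = mask >= mask_prob` in _apply_mask().
        mask = numpy.random.uniform(0.0, 1.0, (len(words),)) >= mask_prob
        masked = []
        for keep, word in zip(mask, words):
            if keep:
                masked.append(word)
                continue
            # Same 80/10/10 split as the _replace_word() helper defined
            # just below: "[MASK]", a random word, or the original word.
            roll = numpy.random.random()
            if roll < 0.8:
                masked.append("[MASK]")
            elif roll < 0.9:
                # Simplification: sample uniformly from this sentence
                # instead of frequency-weighted sampling from the vocab.
                masked.append(str(numpy.random.choice(words)))
            else:
                masked.append(word)
        return mask, masked

    print(toy_apply_mask("give it a sentence to corrupt".split()))

In the wrapper's backward pass, `d_output *= 1 - mask` zeroes the gradient at the kept positions, so only the roughly 15% of corrupted tokens contribute to the pretraining loss.)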
+ masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces)) + return mask, masked_docs + + +def _replace_word(word, random_words, mask="[MASK]"): + roll = numpy.random.random() + if roll < 0.8: + return mask + elif roll < 0.9: + return random_words.next() + else: + return word \ No newline at end of file From ddf8244df954972d81b22449c149d1b79964b2cf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 14:52:54 +0200 Subject: [PATCH 07/13] add normalize option to distance metric --- examples/experiments/onto-joint/pretrain.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 6a41cc677..87501fb16 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -76,6 +76,7 @@ learn_rate = 0.001 [pretraining.loss_func] @losses = "CosineDistance.v1" +normalize = true [nlp] lang = "en" From 4ed6278663c9482e14b549b2079f02cc186bc078 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 19:32:40 +0200 Subject: [PATCH 08/13] small fixes to pretrain config, init_tok2vec TODO --- examples/experiments/onto-joint/pretrain.cfg | 6 +++- .../ptb-joint-pos-dep/bilstm_tok2vec.cfg | 2 +- spacy/cli/pretrain.py | 34 ++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 87501fb16..f1de3eab9 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -45,12 +45,16 @@ eps = 1e-8 learn_rate = 0.001 [pretraining] -max_epochs = 100 +max_epochs = 1000 +start_epoch = 0 min_length = 5 max_length = 500 dropout = 0.2 n_save_every = null batch_size = 3000 +seed = ${training:seed} +use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} +init_tok2vec = null [pretraining.model] @architectures = "spacy.HashEmbedCNN.v1" diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg index 52faad9ec..acbcc8d41 100644 --- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg +++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg @@ -66,4 +66,4 @@ depth = 4 embed_size = 2000 subword_features = true maxout_pieces = 3 -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0022a0d07..d6f4d484c 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -16,14 +16,15 @@ from ..tokens import Doc from ..attrs import ID, HEAD from .. 
import util from ..gold import Example +from .deprecated_pretrain import _load_pretrained_tok2vec # TODO @plac.annotations( # fmt: off texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str), vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str), - config_path=("Path to config file", "positional", None, Path), output_dir=("Directory to write models to on each epoch", "positional", None, Path), + config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), # fmt: on ) @@ -60,8 +61,8 @@ def pretrain( msg.info(f"Loading config from: {config_path}") config = util.load_config(config_path, create_objects=False) - util.fix_random_seed(config["training"]["seed"]) - if config["training"]["use_pytorch_for_gpu_memory"]: + util.fix_random_seed(config["pretraining"]["seed"]) + if config["pretraining"]["use_pytorch_for_gpu_memory"]: use_pytorch_for_gpu_memory() if output_dir.exists() and [p for p in output_dir.iterdir()]: @@ -100,8 +101,33 @@ def pretrain( tok2vec = pretrain_config["model"] model = create_pretraining_model(nlp, tok2vec) optimizer = pretrain_config["optimizer"] + init_tok2vec = pretrain_config["init_tok2vec"] + epoch_start = pretrain_config["epoch_start"] + + # Load in pretrained weights - TODO test + if init_tok2vec is not None: + components = _load_pretrained_tok2vec(nlp, init_tok2vec) + msg.text(f"Loaded pretrained tok2vec for: {components}") + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + else: + if not epoch_start: + msg.fail( + "You have to use the epoch_start setting when using a renamed weight file for init_tok2vec", + exits=True, + ) + elif epoch_start < 0: + msg.fail( + f"The setting epoch_start has to be greater or equal to 0. 
{epoch_start} is invalid", + exits=True, + ) + else: + # Without 'init-tok2vec' the 'epoch_start' setting is ignored + epoch_start = 0 - epoch_start = 0 # TODO tracker = ProgressTracker(frequency=10000) msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} From 07886a3de35f6a9188c6a2963c45e4fbda138004 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 22:00:25 +0200 Subject: [PATCH 09/13] rename init_tok2vec to resume --- examples/experiments/onto-joint/pretrain.cfg | 2 - spacy/cli/pretrain.py | 61 ++++++++++++-------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index f1de3eab9..1637cceae 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -46,7 +46,6 @@ learn_rate = 0.001 [pretraining] max_epochs = 1000 -start_epoch = 0 min_length = 5 max_length = 500 dropout = 0.2 @@ -54,7 +53,6 @@ n_save_every = null batch_size = 3000 seed = ${training:seed} use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} -init_tok2vec = null [pretraining.model] @architectures = "spacy.HashEmbedCNN.v1" diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index d6f4d484c..0a04de101 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -16,7 +16,6 @@ from ..tokens import Doc from ..attrs import ID, HEAD from .. import util from ..gold import Example -from .deprecated_pretrain import _load_pretrained_tok2vec # TODO @plac.annotations( @@ -26,7 +25,10 @@ from .deprecated_pretrain import _load_pretrained_tok2vec # TODO output_dir=("Directory to write models to on each epoch", "positional", None, Path), config_path=("Path to config file", "positional", None, Path), use_gpu=("Use GPU", "option", "g", int), - # fmt: on + resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), + epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), + +# fmt: on ) def pretrain( texts_loc, @@ -34,6 +36,8 @@ def pretrain( config_path, output_dir, use_gpu=-1, + resume_path=None, + epoch_resume=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -66,11 +70,19 @@ def pretrain( use_pytorch_for_gpu_memory() if output_dir.exists() and [p for p in output_dir.iterdir()]: - msg.warn( - "Output directory is not empty", - "It is better to use an empty directory or refer to a new output path, " - "then the new directory will be created for you.", - ) + if resume_path: + msg.warn( + "Output directory is not empty. ", + "If you're resuming a run from a previous " + "model, the old models for the consecutive epochs will be overwritten " + "with the new ones.", + ) + else: + msg.warn( + "Output directory is not empty. 
", + "It is better to use an empty directory or refer to a new output path, " + "then the new directory will be created for you.", + ) if not output_dir.exists(): output_dir.mkdir() msg.good(f"Created output directory: {output_dir}") @@ -92,7 +104,7 @@ def pretrain( msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin - msg.text("Reading input text from stdin...") + msg.info("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading(f"Loading model '{vectors_model}'..."): @@ -101,35 +113,36 @@ def pretrain( tok2vec = pretrain_config["model"] model = create_pretraining_model(nlp, tok2vec) optimizer = pretrain_config["optimizer"] - init_tok2vec = pretrain_config["init_tok2vec"] - epoch_start = pretrain_config["epoch_start"] - # Load in pretrained weights - TODO test - if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) - msg.text(f"Loaded pretrained tok2vec for: {components}") + # Load in pretrained weights to resume from + if resume_path is not None: + msg.info(f"Resume training tok2vec from: {resume_path}") + with resume_path.open("rb") as file_: + weights_data = file_.read() + model.get_ref("tok2vec").from_bytes(weights_data) # Parse the epoch number from the given weight file - model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + model_name = re.search(r"model\d+\.bin", str(resume_path)) if model_name: # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' - epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + epoch_resume = int(model_name.group(0)[5:][:-4]) + 1 + msg.info(f"Resuming from epoch: {epoch_resume}") else: - if not epoch_start: + if not epoch_resume: msg.fail( - "You have to use the epoch_start setting when using a renamed weight file for init_tok2vec", + "You have to use the --epoch_resume setting when using a renamed weight file for --resume_path", exits=True, ) - elif epoch_start < 0: + elif epoch_resume < 0: msg.fail( - f"The setting epoch_start has to be greater or equal to 0. {epoch_start} is invalid", + f"The setting --epoch_resume has to be greater or equal to 0. 
{epoch_resume} is invalid", exits=True, ) else: - # Without 'init-tok2vec' the 'epoch_start' setting is ignored - epoch_start = 0 + # Without 'resume_path' the 'epoch_resume' setting is ignored + epoch_resume = 0 tracker = ProgressTracker(frequency=10000) - msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_start}") + msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}") row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -149,7 +162,7 @@ def pretrain( skip_counter = 0 loss_func = pretrain_config["loss_func"] - for epoch in range(epoch_start, pretrain_config["max_epochs"]): + for epoch in range(epoch_resume, pretrain_config["max_epochs"]): examples = [Example(doc=text) for text in texts] batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"]) for batch_id, batch in enumerate(batches): From 1775f54a2627ccad23b81d64e74c19777f71057f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 22:17:02 +0200 Subject: [PATCH 10/13] small little fixes --- spacy/cli/pretrain.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0a04de101..96564b98b 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,10 +5,9 @@ import re from collections import Counter import plac from pathlib import Path -from thinc.api import Linear, Maxout, chain, list2array +from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory from wasabi import msg import srsly -from thinc.api import use_pytorch_for_gpu_memory from ..errors import Errors from ..ml.models.multi_task import build_masked_language_model @@ -73,8 +72,8 @@ def pretrain( if resume_path: msg.warn( "Output directory is not empty. ", - "If you're resuming a run from a previous " - "model, the old models for the consecutive epochs will be overwritten " + "If you're resuming a run from a previous model in this directory, " + "the old models for the consecutive epochs will be overwritten " "with the new ones.", ) else: @@ -129,16 +128,18 @@ def pretrain( else: if not epoch_resume: msg.fail( - "You have to use the --epoch_resume setting when using a renamed weight file for --resume_path", + "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path", exits=True, ) elif epoch_resume < 0: msg.fail( - f"The setting --epoch_resume has to be greater or equal to 0. {epoch_resume} is invalid", + f"The argument --epoch-resume has to be greater or equal to 0. 
{epoch_resume} is invalid", exits=True, ) + else: + msg.info(f"Resuming from epoch: {epoch_resume}") else: - # Without 'resume_path' the 'epoch_resume' setting is ignored + # Without '--resume-path' the '--epoch-resume' argument is ignored epoch_resume = 0 tracker = ProgressTracker(frequency=10000) From 6b027d76893de1b535f17a9b2848aba93bb2bb41 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 4 Jun 2020 15:49:23 +0200 Subject: [PATCH 11/13] remove duplicate model definition of tok2vec layer --- examples/experiments/onto-joint/pretrain.cfg | 12 +----------- spacy/cli/pretrain.py | 5 ++++- spacy/ml/models/multi_task.py | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg index 1637cceae..4f1898d69 100644 --- a/examples/experiments/onto-joint/pretrain.cfg +++ b/examples/experiments/onto-joint/pretrain.cfg @@ -53,17 +53,7 @@ n_save_every = null batch_size = 3000 seed = ${training:seed} use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory} - -[pretraining.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 256 -depth = 6 -window_size = 1 -embed_size = 2000 -maxout_pieces = 3 -subword_features = true -dropout = null +tok2vec_model = "nlp.pipeline.tok2vec.model" [pretraining.optimizer] @optimizers = "Adam.v1" diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 96564b98b..921eb38ab 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -109,7 +109,10 @@ def pretrain( with msg.loading(f"Loading model '{vectors_model}'..."): nlp = util.load_model(vectors_model) msg.good(f"Loaded model '{vectors_model}'") - tok2vec = pretrain_config["model"] + tok2vec_path = pretrain_config["tok2vec_model"] + tok2vec = config + for subpath in tok2vec_path.split("."): + tok2vec = tok2vec.get(subpath) model = create_pretraining_model(nlp, tok2vec) optimizer = pretrain_config["optimizer"] diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 970d31899..8000d1aff 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -102,4 +102,4 @@ def _replace_word(word, random_words, mask="[MASK]"): elif roll < 0.9: return random_words.next() else: - return word \ No newline at end of file + return word From 776d4f11909d796963068a9a931fdca8b71a8ccc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 4 Jun 2020 16:07:30 +0200 Subject: [PATCH 12/13] cleanup --- spacy/cli/train_from_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py index 9cdc3bf2f..a6d0a0abc 100644 --- a/spacy/cli/train_from_config.py +++ b/spacy/cli/train_from_config.py @@ -76,7 +76,7 @@ maxout_pieces = 3 subword_features = true """ -# TODO: REMOVE ? + class PipelineComponent(BaseModel): factory: str model: Model @@ -84,7 +84,7 @@ class PipelineComponent(BaseModel): class Config: arbitrary_types_allowed = True -# TODO: REMOVE ? 
+ class ConfigSchema(BaseModel): optimizer: Optional["Optimizer"] From 3ade455fd35eb14bf59f4d0276c1699323a947a3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 4 Jun 2020 16:09:55 +0200 Subject: [PATCH 13/13] formatting --- spacy/cli/pretrain.py | 7 ++++--- spacy/ml/__init__.py | 2 +- spacy/pipeline/defaults/entity_linker_defaults.cfg | 2 +- spacy/pipeline/defaults/morphologizer_defaults.cfg | 2 +- spacy/pipeline/defaults/textcat_defaults.cfg | 2 +- spacy/pipeline/defaults/tok2vec_defaults.cfg | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 921eb38ab..d37426b5a 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -26,8 +26,7 @@ from ..gold import Example use_gpu=("Use GPU", "option", "g", int), resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), - -# fmt: on + # fmt: on ) def pretrain( texts_loc, @@ -183,7 +182,9 @@ def pretrain( msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break - if pretrain_config["n_save_every"] and (batch_id % pretrain_config["n_save_every"] == 0): + if pretrain_config["n_save_every"] and ( + batch_id % pretrain_config["n_save_every"] == 0 + ): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index cf4f59d6c..aed4fa323 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1 +1 @@ -from .models import * \ No newline at end of file +from .models import * diff --git a/spacy/pipeline/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg index 26a294f37..8dddf9e7b 100644 --- a/spacy/pipeline/defaults/entity_linker_defaults.cfg +++ b/spacy/pipeline/defaults/entity_linker_defaults.cfg @@ -10,4 +10,4 @@ embed_size = 300 window_size = 1 maxout_pieces = 3 subword_features = true -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/pipeline/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg index c4452c689..6ee053a08 100644 --- a/spacy/pipeline/defaults/morphologizer_defaults.cfg +++ b/spacy/pipeline/defaults/morphologizer_defaults.cfg @@ -11,4 +11,4 @@ window_size = 1 maxout_pieces = 3 nM = 64 nC = 8 -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/pipeline/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg index e5817de4a..0981cf77c 100644 --- a/spacy/pipeline/defaults/textcat_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_defaults.cfg @@ -7,4 +7,4 @@ conv_depth = 2 embed_size = 2000 window_size = 1 ngram_size = 1 -dropout = null \ No newline at end of file +dropout = null diff --git a/spacy/pipeline/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg index 36bf0c3da..d2718eed1 100644 --- a/spacy/pipeline/defaults/tok2vec_defaults.cfg +++ b/spacy/pipeline/defaults/tok2vec_defaults.cfg @@ -7,4 +7,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true -dropout = null \ No newline at end of file +dropout = null
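Two small illustrations of details from the later patches in this series; both are sketches only, not part of any diff above.

First, the `tok2vec_model = "nlp.pipeline.tok2vec.model"` setting added when the duplicate [pretraining.model] block was removed: pretrain() resolves it by walking the loaded config with a dotted path, which on a plain nested dict behaves like this (the dict below is a toy stand-in for the resolved config):

    def resolve_dotted(config, path):
        # Chained dict lookup, as in: for subpath in path.split("."): ...
        node = config
        for part in path.split("."):
            node = node.get(part)
        return node

    toy_config = {
        "nlp": {
            "pipeline": {
                "tok2vec": {
                    "model": {"@architectures": "spacy.HashEmbedCNN.v1", "width": 256}
                }
            }
        }
    }
    print(resolve_dotted(toy_config, "nlp.pipeline.tok2vec.model"))

Second, the --resume-path / --epoch-resume logic: when the weights file still has its default name, the epoch to resume from is parsed out of the filename; otherwise --epoch-resume must be given explicitly. The filenames below are made up for the example:

    import re

    def epoch_from_weights_filename(path):
        # "model18.bin" -> resume at epoch 19; None means the file was
        # renamed and the caller has to pass --epoch-resume themselves.
        match = re.search(r"model\d+\.bin", str(path))
        if match is None:
            return None
        return int(match.group(0)[5:][:-4]) + 1

    print(epoch_from_weights_filename("output/model18.bin"))       # 19
    print(epoch_from_weights_filename("output/best_weights.bin"))  # None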