diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index c9f82caa0..0f7226083 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
diff --git a/spacy/language.py b/spacy/language.py
index f695ddc9e..91f4b99d4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer
 
     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 611eef033..d3f8c339c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
-        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        # Convert the empty tag "" to the missing value None so that both
+        # misaligned tokens and tokens with missing annotation have the
+        # default missing value None.
+        truths = []
+        for eg in examples:
+            eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
+            truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 3ea611287..d041845f3 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on
 
     class Config:
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 6f75c52a4..282961755 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -37,7 +37,16 @@ TRAIN_DATA = [
 ]
 
 PARTIAL_DATA = [
+    # partial annotation
     ("I like green eggs", {"tags": ["", "V", "J", ""]}),
+    # misaligned partial annotation
+    (
+        "He hates green eggs",
+        {
+            "words": ["He", "hate", "s", "green", "eggs"],
+            "tags": ["", "V", "S", "J", ""],
+        },
+    ),
 ]
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 42654ff7d..6709defb8 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from catalogue import RegistryError
 
 from ..util import make_tempdir
 
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
         nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
+
+
+def test_config_only_resolve_relevant_blocks():
+    """Test that only the relevant blocks are resolved in the different methods
+    and that invalid blocks are ignored if needed. For instance, the
+    [initialize] block shouldn't be resolved at runtime.
+    """
+    nlp = English()
+    config = nlp.config
+    config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
+    config["initialize"]["lookups"] = {"@misc": "nonexistent"}
+    # This shouldn't resolve [training] or [initialize]
+    nlp = load_model_from_config(config, auto_fill=True)
+    # This will raise for the nonexistent value
+    with pytest.raises(RegistryError):
+        nlp.initialize()
+    nlp.config["initialize"]["lookups"] = None
+    nlp.initialize()
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 4fbcaee9e..6ffeeadce 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False
 
     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
 
         return after_pipeline_creation
 
+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "before_creation": {"@callbacks": f"{name}_before"},
             "after_creation": {"@callbacks": f"{name}_after"},
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
 
 
 def test_language_from_config_before_after_init_invalid():
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 577c80cb3..fe2d4b18f 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -59,6 +59,19 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with nlp.use_params(optimizer.averages):
+            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Create iterator, which yields out info after each optimization step.
@@ -87,40 +100,31 @@ def train(
         if is_best_checkpoint is not None and output_path is not None:
             with nlp.select_pipes(disable=frozen_components):
                 update_meta(T, nlp, info)
-            with nlp.use_params(optimizer.averages):
-                nlp = before_to_disk(nlp)
-                nlp.to_disk(output_path / DIR_MODEL_LAST)
-            if is_best_checkpoint:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(output_path / DIR_MODEL_BEST)
-
+            save_checkpoint(is_best_checkpoint)
     except Exception as e:
         if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error, but we do want to warn that we're trying
-            # to do something here.
             stdout.write(
                 msg.warn(
                     f"Aborting and saving the final best model. "
-                    f"Encountered exception: {str(e)}"
+                    f"Encountered exception: {repr(e)}"
                 )
                 + "\n"
             )
         raise e
     finally:
         finalize_logger()
-        if optimizer.averages:
-            nlp.use_params(optimizer.averages)
-        if output_path is not None:
-            final_model_path = output_path / DIR_MODEL_LAST
-            nlp.to_disk(final_model_path)
-            # This will only run if we don't hit an error
-            stdout.write(
-                msg.good("Saved pipeline to output directory", final_model_path) + "\n"
-            )
-            return (nlp, final_model_path)
-        else:
-            return (nlp, None)
+        if output_path is not None:
+            save_checkpoint(False)
+    # This will only run if we didn't hit an error
+    if optimizer.averages:
+        nlp.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (nlp, output_path / DIR_MODEL_LAST)
+    else:
+        return (nlp, None)
 
 
 def train_while_improving(
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index d0a2ac819..33f647413 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
 typically formed within one document, this function takes a [`Doc`](/api/doc)
 as input and outputs a `List` of `Span` tuples. For instance, the following
 implementation takes any two entities from the same document, as long as they
-are within a **maximum distance** (in number of tokens) of eachother:
+are within a **maximum distance** (in number of tokens) of each other:
 
 > #### config.cfg (excerpt)
 >
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
     return get_candidates
 ```
 
-This function in added to the [`@misc` registry](/api/top-level#registry) so we
+This function is added to the [`@misc` registry](/api/top-level#registry) so we
 can refer to it from the config, and easily swap it out for any other candidate
 generation function.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index f8e502966..ad5bec92a 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few small
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:
 
-| Callback                  | Description                                                                                                                                                                               |
-| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer.          |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                |
+| Callback                      | Description                                                                                                                                                                                                                 |
+| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object.                                                                                |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                                                  |
+| `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option.                           |
+| `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification.                                                                                                              |
 
 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key.
 If a block contains a key starting with an `@`, it's interpreted as a reference
 to a function. Because you've registered the function, spaCy knows how to create
 it when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:
 
 > #### config.cfg
 >
 > ```ini
@@ -643,7 +645,7 @@ import spacy
 
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls
 
     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```
 
 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy
 
 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls
 
     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 
 ```cli
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 
+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the
+tokenizer settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning">
+
+- `nlp.before_creation` is the best place to modify language defaults other
+  than the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded
+from disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}
 
 During training, the results of each step are passed to a logger function. By
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 47ddcf53a..9b911b960 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -930,6 +930,55 @@ treebank.
 
+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}
 
 For most use cases, you **shouldn't** have to write your own training scripts
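The new `[initialize]` callbacks can also be exercised directly from the Python API, without going through `spacy train`. A minimal sketch that mirrors the patterns in the tests above — the registry name `my_before_init` and the special-case token are illustrative, and mutating `nlp.config` before calling `nlp.initialize()` follows the same approach as `test_config_only_resolve_relevant_blocks()`:

```python
### init_callbacks_sketch.py
import spacy
from spacy.util import registry

@registry.callbacks("my_before_init")
def make_before_init():
    def before_init(nlp):
        # Runs inside nlp.initialize(), before the pipeline components are
        # initialized — the spot for tokenizer tweaks, like the v2 base model
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
        return nlp

    return before_init

nlp = spacy.blank("en")
# Point the [initialize] block at the registered callback, then initialize
nlp.config["initialize"]["before_init"] = {"@callbacks": "my_before_init"}
nlp.initialize()
assert len(nlp("hello _SPECIAL_ world")) == 3
```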