Add initialize.before_init and after_init callbacks

Add `initialize.before_init` and `initialize.after_init` callbacks to the config. The `initialize.before_init` callback is a place to implement one-time tokenizer customizations that are then saved with the model.
2021-01-12 11:29:31 +01:00 · 2021-01-12 11:29:31 +01:00 · a45d89f09a
parent ad43cbb042
commit a45d89f09a
6 changed files with 165 additions and 15 deletions
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
 before_init = null
 after_init = null
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1209,6 +1209,9 @@ class Language:
        config = self.config.interpolate()
        # These are the settings provided in the [initialize] block in the config
        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
        before_init = I["before_init"]
        if before_init is not None:
            before_init(self)
        init_vocab(
            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
        )
@ -1240,6 +1243,9 @@ class Language:
            self._optimizer = sgd
        elif self._optimizer is None:
            self._optimizer = self.create_optimizer()
        after_init = I["after_init"]
        if after_init is not None:
            after_init(self)
        return self._optimizer
    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
    # fmt: on
    class Config:
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
    ran_before = False
    ran_after = False
    ran_after_pipeline = False
    ran_before_init = False
    ran_after_init = False
    @registry.callbacks(f"{name}_before")
    def make_before_creation():
@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
        return after_pipeline_creation
    # Factory registered under the name that the test config's
    # [initialize] "before_init" entry refers to; it returns the actual hook.
    @registry.callbacks(f"{name}_before_init")
    def make_before_init():
        def before_init(nlp):
            # Flip the flag in the enclosing test so it can assert the hook ran.
            nonlocal ran_before_init
            ran_before_init = True
            # Marker in the meta: the test asserts it is absent after
            # from_config() and present only after nlp.initialize().
            nlp.meta["before_init"] = "before"
            return nlp
        return before_init
    # Factory registered under the name that the test config's
    # [initialize] "after_init" entry refers to; it returns the actual hook.
    @registry.callbacks(f"{name}_after_init")
    def make_after_init():
        def after_init(nlp):
            # Flip the flag in the enclosing test so it can assert the hook ran.
            nonlocal ran_after_init
            ran_after_init = True
            # Marker in the meta: the test asserts it is absent after
            # from_config() and present only after nlp.initialize().
            nlp.meta["after_init"] = "after"
            return nlp
        return after_init
    config = {
        "nlp": {
            "pipeline": ["sentencizer"],
@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
            "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
        },
        "components": {"sentencizer": {"factory": "sentencizer"}},
        "initialize": {
            "before_init": {"@callbacks": f"{name}_before_init"},
            "after_init": {"@callbacks": f"{name}_after_init"},
        },
    }
    nlp = English.from_config(config)
    assert all([ran_before, ran_after, ran_after_pipeline])
    assert nlp.Defaults.foo == "bar"
    assert nlp.meta["foo"] == "bar"
    assert nlp.meta["bar"] == "baz"
    assert "before_init" not in nlp.meta
    assert "after_init" not in nlp.meta
    assert nlp.pipe_names == ["sentencizer"]
    assert nlp("text")
    nlp.initialize()
    assert nlp.meta["before_init"] == "before"
    assert nlp.meta["after_init"] == "after"
    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
 def test_language_from_config_before_after_init_invalid():
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:
-| Callback                  | Description                                                                                                                                                                              |
+| Callback                      | Description                                                                                                                                                                                                                |
-| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer.          |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object.                                                                                |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                                                  |
 | `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option.                           |
 | `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification.                                                                                                              |
 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
+of a callback that runs before the `nlp` object is created and adds a custom
-tokenization rules to the defaults:
+stop word to the defaults:
 > #### config.cfg
 >
@ -643,7 +645,7 @@ import spacy
@spacy.registry.callbacks("customize_language_data")
 def create_callback():
    def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
        return lang_cls
    return customize_language_data
@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```
 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy
@spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
    def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
        if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
        return lang_cls
    return customize_language_data
@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 #### Example: Modifying tokenizer settings {#custom-tokenizer}
 Use the `initialize.before_init` callback to modify the tokenizer settings when
 training a new pipeline. Write a registered callback that modifies the tokenizer
 settings and specify this callback in your config:
 > #### config.cfg
 >
 > ```ini
 > [initialize]
 >
 > [initialize.before_init]
 > @callbacks = "customize_tokenizer"
 > ```
 ```python
 ### functions.py
 from spacy.util import registry, compile_suffix_regex
@registry.callbacks("customize_tokenizer")
 def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        # remove a suffix
        suffixes = list(nlp.Defaults.suffixes)
        suffixes.remove("\\[")
        suffix_regex = compile_suffix_regex(suffixes)
        nlp.tokenizer.suffix_search = suffix_regex.search
        # add a special case
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
    return customize_tokenizer
 ```
 When training, provide the function above with the `--code` option:
 ```cli
 $ python -m spacy train config.cfg --code ./functions.py
 ```
 Because this callback is only called in the one-time initialization step before
 training, the callback code does not need to be packaged with the final pipeline
 package. However, to make it easier for others to replicate your training setup,
 you can choose to package the initialization callbacks with the pipeline package
 or to publish them separately.
 <Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
 - `nlp.before_creation` is the best place to modify language defaults other than
  the tokenizer settings.
 - `initialize.before_init` is the best place to modify tokenizer settings when
  training a new pipeline.
 Unlike the other language defaults, the tokenizer settings are saved with the
 pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
 will be clobbered by the saved settings when the trained pipeline is loaded from
 disk.
 </Infobox>
 #### Example: Custom logging function {#custom-logging}
 During training, the results of each step are passed to a logger function. By
@ -1060,7 +1120,7 @@ In this example we assume a custom function `read_custom_data` which loads or
 generates texts with relevant text classification annotations. Then, small
 lexical variations of the input text are created before generating the final
 [`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
-you register the function creating the custom reader in the `readers` 
+you register the function creating the custom reader in the `readers`
 [registry](/api/top-level#registry) and assign it a string name, so it can be
 used in your config. All arguments on the registered function become available
 as **config settings** – in this case, `source`.
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -930,6 +930,55 @@ treebank.
 </Project>
 #### Modifying tokenizer settings
 If you were using a base model with `spacy train` to customize the tokenizer
 settings in v2, your modifications can be provided in the
 `[initialize.before_init]` callback.
 Write a registered callback that modifies the tokenizer settings and specify
 this callback in your config:
 > #### config.cfg
 >
 > ```ini
 > [initialize]
 >
 > [initialize.before_init]
 > @callbacks = "customize_tokenizer"
 > ```
 ```python
 ### functions.py
 from spacy.util import registry, compile_suffix_regex
@registry.callbacks("customize_tokenizer")
 def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        # remove a suffix
        suffixes = list(nlp.Defaults.suffixes)
        suffixes.remove("\\[")
        suffix_regex = compile_suffix_regex(suffixes)
        nlp.tokenizer.suffix_search = suffix_regex.search
        # add a special case
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
    return customize_tokenizer
 ```
 When training, provide the function above with the `--code` option:
 ```cli
 $ python -m spacy train config.cfg --code ./functions.py
 ```
 The train step requires the `--code` option with your registered functions from
 the `[initialize]` block, but since those callbacks are only required during the
 initialization step, you don't need to provide them with the final pipeline
 package. However, to make it easier for others to replicate your training setup,
 you can choose to package the initialization callbacks with the pipeline package
 or to publish them separately.
 #### Training via the Python API {#migrating-training-python}
 For most use cases, you **shouldn't** have to write your own training scripts