mirror of https://github.com/explosion/spaCy.git
Add initialize.before_init and after_init callbacks
Add `initialize.before_init` and `initialize.after_init` callbacks to the config. The `initialize.before_init` callback is a place to implement one-time tokenizer customizations that are then saved with the model.
This commit is contained in:
parent
ad43cbb042
commit
a45d89f09a
|
@ -124,3 +124,5 @@ lookups = null
|
|||
tokenizer = {}
|
||||
# Arguments for initialize methods of the components (keyed by component)
|
||||
components = {}
|
||||
before_init = null
|
||||
after_init = null
|
||||
|
|
|
@ -1209,6 +1209,9 @@ class Language:
|
|||
config = self.config.interpolate()
|
||||
# These are the settings provided in the [initialize] block in the config
|
||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||
before_init = I["before_init"]
|
||||
if before_init is not None:
|
||||
before_init(self)
|
||||
init_vocab(
|
||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||
)
|
||||
|
@ -1240,6 +1243,9 @@ class Language:
|
|||
self._optimizer = sgd
|
||||
elif self._optimizer is None:
|
||||
self._optimizer = self.create_optimizer()
|
||||
after_init = I["after_init"]
|
||||
if after_init is not None:
|
||||
after_init(self)
|
||||
return self._optimizer
|
||||
|
||||
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||||
|
|
|
@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
|
|||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
||||
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
|
||||
before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
|
||||
after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
|
|||
ran_before = False
|
||||
ran_after = False
|
||||
ran_after_pipeline = False
|
||||
ran_before_init = False
|
||||
ran_after_init = False
|
||||
|
||||
@registry.callbacks(f"{name}_before")
|
||||
def make_before_creation():
|
||||
|
@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
|
|||
|
||||
return after_pipeline_creation
|
||||
|
||||
@registry.callbacks(f"{name}_before_init")
|
||||
def make_before_init():
|
||||
def before_init(nlp):
|
||||
nonlocal ran_before_init
|
||||
ran_before_init = True
|
||||
nlp.meta["before_init"] = "before"
|
||||
return nlp
|
||||
|
||||
return before_init
|
||||
|
||||
@registry.callbacks(f"{name}_after_init")
|
||||
def make_after_init():
|
||||
def after_init(nlp):
|
||||
nonlocal ran_after_init
|
||||
ran_after_init = True
|
||||
nlp.meta["after_init"] = "after"
|
||||
return nlp
|
||||
|
||||
return after_init
|
||||
|
||||
config = {
|
||||
"nlp": {
|
||||
"pipeline": ["sentencizer"],
|
||||
|
@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
|
|||
"after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
|
||||
},
|
||||
"components": {"sentencizer": {"factory": "sentencizer"}},
|
||||
"initialize": {
|
||||
"before_init": {"@callbacks": f"{name}_before_init"},
|
||||
"after_init": {"@callbacks": f"{name}_after_init"},
|
||||
},
|
||||
}
|
||||
nlp = English.from_config(config)
|
||||
assert all([ran_before, ran_after, ran_after_pipeline])
|
||||
assert nlp.Defaults.foo == "bar"
|
||||
assert nlp.meta["foo"] == "bar"
|
||||
assert nlp.meta["bar"] == "baz"
|
||||
assert "before_init" not in nlp.meta
|
||||
assert "after_init" not in nlp.meta
|
||||
assert nlp.pipe_names == ["sentencizer"]
|
||||
assert nlp("text")
|
||||
nlp.initialize()
|
||||
assert nlp.meta["before_init"] == "before"
|
||||
assert nlp.meta["after_init"] == "after"
|
||||
assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
|
||||
|
||||
|
||||
def test_language_from_config_before_after_init_invalid():
|
||||
|
|
|
@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
|
|||
modifications, like adjusting the
|
||||
[tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
|
||||
[language defaults](/api/language#defaults) like stop words. The config lets you
|
||||
provide three optional **callback functions** that give you access to the
|
||||
provide five optional **callback functions** that give you access to the
|
||||
language class and `nlp` object at different points of the lifecycle:
|
||||
|
||||
| Callback | Description |
|
||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
|
||||
| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
|
||||
| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
|
||||
| Callback | Description |
|
||||
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
|
||||
| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
|
||||
| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
|
||||
| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
|
||||
| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |
|
||||
|
||||
The `@spacy.registry.callbacks` decorator lets you register your custom function
|
||||
in the `callbacks` [registry](/api/top-level#registry) under a given name. You
|
||||
|
@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
|
|||
a block contains a key starting with an `@`, it's interpreted as a reference to
|
||||
a function. Because you've registered the function, spaCy knows how to create it
|
||||
when you reference `"customize_language_data"` in your config. Here's an example
|
||||
of a callback that runs before the `nlp` object is created and adds a few custom
|
||||
tokenization rules to the defaults:
|
||||
of a callback that runs before the `nlp` object is created and adds a custom
|
||||
stop word to the defaults:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
|
@ -643,7 +645,7 @@ import spacy
|
|||
@spacy.registry.callbacks("customize_language_data")
|
||||
def create_callback():
|
||||
def customize_language_data(lang_cls):
|
||||
lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
|
||||
lang_cls.Defaults.stop_words.add("good")
|
||||
return lang_cls
|
||||
|
||||
return customize_language_data
|
||||
|
@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
|
|||
> ```
|
||||
|
||||
```python
|
||||
### functions.py {highlight="5,8-10"}
|
||||
### functions.py {highlight="5,7-9"}
|
||||
from typing import List
|
||||
import spacy
|
||||
|
||||
@spacy.registry.callbacks("customize_language_data")
|
||||
def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
|
||||
def customize_language_data(lang_cls):
|
||||
lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
|
||||
lang_cls.Defaults.stop_words.add(extra_stop_words)
|
||||
lang_cls.Defaults.stop_words.update(extra_stop_words)
|
||||
if debug:
|
||||
print("Updated stop words and tokenizer suffixes")
|
||||
print("Updated stop words")
|
||||
return lang_cls
|
||||
|
||||
return customize_language_data
|
||||
|
@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
|
|||
$ python -m spacy train config.cfg --output ./output --code ./functions.py
|
||||
```
|
||||
|
||||
#### Example: Modifying tokenizer settings {#custom-tokenizer}
|
||||
|
||||
Use the `initialize.before_init` callback to modify the tokenizer settings when
|
||||
training a new pipeline. Write a registered callback that modifies the tokenizer
|
||||
settings and specify this callback in your config:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
> ```ini
|
||||
> [initialize]
|
||||
>
|
||||
> [initialize.before_init]
|
||||
> @callbacks = "customize_tokenizer"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### functions.py
|
||||
from spacy.util import registry, compile_suffix_regex
|
||||
|
||||
@registry.callbacks("customize_tokenizer")
|
||||
def make_customize_tokenizer():
|
||||
def customize_tokenizer(nlp):
|
||||
# remove a suffix
|
||||
suffixes = list(nlp.Defaults.suffixes)
|
||||
suffixes.remove("\\[")
|
||||
suffix_regex = compile_suffix_regex(suffixes)
|
||||
nlp.tokenizer.suffix_search = suffix_regex.search
|
||||
|
||||
# add a special case
|
||||
nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
|
||||
return customize_tokenizer
|
||||
```
|
||||
|
||||
When training, provide the function above with the `--code` option:
|
||||
|
||||
```cli
|
||||
$ python -m spacy train config.cfg --code ./functions.py
|
||||
```
|
||||
|
||||
Because this callback is only called in the one-time initialization step before
|
||||
training, the callback code does not need to be packaged with the final pipeline
|
||||
package. However, to make it easier for others to replicate your training setup,
|
||||
you can choose to package the initialization callbacks with the pipeline package
|
||||
or to publish them separately.
|
||||
|
||||
<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
|
||||
|
||||
- `nlp.before_creation` is the best place to modify language defaults other than
|
||||
the tokenizer settings.
|
||||
- `initialize.before_init` is the best place to modify tokenizer settings when
|
||||
training a new pipeline.
|
||||
|
||||
Unlike the other language defaults, the tokenizer settings are saved with the
|
||||
pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
|
||||
will be clobbered by the saved settings when the trained pipeline is loaded from
|
||||
disk.
|
||||
|
||||
</Infobox>
|
||||
|
||||
#### Example: Custom logging function {#custom-logging}
|
||||
|
||||
During training, the results of each step are passed to a logger function. By
|
||||
|
@ -1060,7 +1120,7 @@ In this example we assume a custom function `read_custom_data` which loads or
|
|||
generates texts with relevant text classification annotations. Then, small
|
||||
lexical variations of the input text are created before generating the final
|
||||
[`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
|
||||
you register the function creating the custom reader in the `readers`
|
||||
[registry](/api/top-level#registry) and assign it a string name, so it can be
|
||||
used in your config. All arguments on the registered function become available
|
||||
as **config settings** – in this case, `source`.
|
||||
|
|
|
@ -930,6 +930,55 @@ treebank.
|
|||
|
||||
</Project>
|
||||
|
||||
#### Modifying tokenizer settings
|
||||
|
||||
If you were using a base model with `spacy train` to customize the tokenizer
|
||||
settings in v2, your modifications can be provided in the
|
||||
`[initialize.before_init]` callback.
|
||||
|
||||
Write a registered callback that modifies the tokenizer settings and specify
|
||||
this callback in your config:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
> ```ini
|
||||
> [initialize]
|
||||
>
|
||||
> [initialize.before_init]
|
||||
> @callbacks = "customize_tokenizer"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### functions.py
|
||||
from spacy.util import registry, compile_suffix_regex
|
||||
|
||||
@registry.callbacks("customize_tokenizer")
|
||||
def make_customize_tokenizer():
|
||||
def customize_tokenizer(nlp):
|
||||
# remove a suffix
|
||||
suffixes = list(nlp.Defaults.suffixes)
|
||||
suffixes.remove("\\[")
|
||||
suffix_regex = compile_suffix_regex(suffixes)
|
||||
nlp.tokenizer.suffix_search = suffix_regex.search
|
||||
|
||||
# add a special case
|
||||
nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
|
||||
return customize_tokenizer
|
||||
```
|
||||
|
||||
When training, provide the function above with the `--code` option:
|
||||
|
||||
```cli
|
||||
$ python -m spacy train config.cfg --code ./functions.py
|
||||
```
|
||||
|
||||
The train step requires the `--code` option with your registered functions from
|
||||
the `[initialize]` block, but since those callbacks are only required during the
|
||||
initialization step, you don't need to provide them with the final pipeline
|
||||
package. However, to make it easier for others to replicate your training setup,
|
||||
you can choose to package the initialization callbacks with the pipeline package
|
||||
or to publish them separately.
|
||||
|
||||
#### Training via the Python API {#migrating-training-python}
|
||||
|
||||
For most use cases, you **shouldn't** have to write your own training scripts
|
||||
|
|
Loading…
Reference in New Issue