mirror of https://github.com/explosion/spaCy.git
Add initialize.before_init and after_init callbacks
Add `initialize.before_init` and `initialize.after_init` callbacks to the config. The `initialize.before_init` callback is a place to implement one-time tokenizer customizations that are then saved with the model.
This commit is contained in:
parent
ad43cbb042
commit
a45d89f09a
|
@ -124,3 +124,5 @@ lookups = null
|
||||||
tokenizer = {}
|
tokenizer = {}
|
||||||
# Arguments for initialize methods of the components (keyed by component)
|
# Arguments for initialize methods of the components (keyed by component)
|
||||||
components = {}
|
components = {}
|
||||||
|
before_init = null
|
||||||
|
after_init = null
|
||||||
|
|
|
@ -1209,6 +1209,9 @@ class Language:
|
||||||
config = self.config.interpolate()
|
config = self.config.interpolate()
|
||||||
# These are the settings provided in the [initialize] block in the config
|
# These are the settings provided in the [initialize] block in the config
|
||||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||||
|
before_init = I["before_init"]
|
||||||
|
if before_init is not None:
|
||||||
|
before_init(self)
|
||||||
init_vocab(
|
init_vocab(
|
||||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||||
)
|
)
|
||||||
|
@ -1240,6 +1243,9 @@ class Language:
|
||||||
self._optimizer = sgd
|
self._optimizer = sgd
|
||||||
elif self._optimizer is None:
|
elif self._optimizer is None:
|
||||||
self._optimizer = self.create_optimizer()
|
self._optimizer = self.create_optimizer()
|
||||||
|
after_init = I["after_init"]
|
||||||
|
if after_init is not None:
|
||||||
|
after_init(self)
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
|
||||||
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||||||
|
|
|
@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
|
||||||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||||
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
||||||
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
|
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
|
||||||
|
before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
|
||||||
|
after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
|
|
|
@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
|
||||||
ran_before = False
|
ran_before = False
|
||||||
ran_after = False
|
ran_after = False
|
||||||
ran_after_pipeline = False
|
ran_after_pipeline = False
|
||||||
|
ran_before_init = False
|
||||||
|
ran_after_init = False
|
||||||
|
|
||||||
@registry.callbacks(f"{name}_before")
|
@registry.callbacks(f"{name}_before")
|
||||||
def make_before_creation():
|
def make_before_creation():
|
||||||
|
@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
|
||||||
|
|
||||||
return after_pipeline_creation
|
return after_pipeline_creation
|
||||||
|
|
||||||
|
@registry.callbacks(f"{name}_before_init")
|
||||||
|
def make_before_init():
|
||||||
|
def before_init(nlp):
|
||||||
|
nonlocal ran_before_init
|
||||||
|
ran_before_init = True
|
||||||
|
nlp.meta["before_init"] = "before"
|
||||||
|
return nlp
|
||||||
|
|
||||||
|
return before_init
|
||||||
|
|
||||||
|
@registry.callbacks(f"{name}_after_init")
|
||||||
|
def make_after_init():
|
||||||
|
def after_init(nlp):
|
||||||
|
nonlocal ran_after_init
|
||||||
|
ran_after_init = True
|
||||||
|
nlp.meta["after_init"] = "after"
|
||||||
|
return nlp
|
||||||
|
|
||||||
|
return after_init
|
||||||
|
|
||||||
config = {
|
config = {
|
||||||
"nlp": {
|
"nlp": {
|
||||||
"pipeline": ["sentencizer"],
|
"pipeline": ["sentencizer"],
|
||||||
|
@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
|
||||||
"after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
|
"after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
|
||||||
},
|
},
|
||||||
"components": {"sentencizer": {"factory": "sentencizer"}},
|
"components": {"sentencizer": {"factory": "sentencizer"}},
|
||||||
|
"initialize": {
|
||||||
|
"before_init": {"@callbacks": f"{name}_before_init"},
|
||||||
|
"after_init": {"@callbacks": f"{name}_after_init"},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
nlp = English.from_config(config)
|
nlp = English.from_config(config)
|
||||||
assert all([ran_before, ran_after, ran_after_pipeline])
|
|
||||||
assert nlp.Defaults.foo == "bar"
|
assert nlp.Defaults.foo == "bar"
|
||||||
assert nlp.meta["foo"] == "bar"
|
assert nlp.meta["foo"] == "bar"
|
||||||
assert nlp.meta["bar"] == "baz"
|
assert nlp.meta["bar"] == "baz"
|
||||||
|
assert "before_init" not in nlp.meta
|
||||||
|
assert "after_init" not in nlp.meta
|
||||||
assert nlp.pipe_names == ["sentencizer"]
|
assert nlp.pipe_names == ["sentencizer"]
|
||||||
assert nlp("text")
|
assert nlp("text")
|
||||||
|
nlp.initialize()
|
||||||
|
assert nlp.meta["before_init"] == "before"
|
||||||
|
assert nlp.meta["after_init"] == "after"
|
||||||
|
assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
|
||||||
|
|
||||||
|
|
||||||
def test_language_from_config_before_after_init_invalid():
|
def test_language_from_config_before_after_init_invalid():
|
||||||
|
|
|
@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
|
||||||
modifications, like adjusting the
|
modifications, like adjusting the
|
||||||
[tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
|
[tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
|
||||||
[language defaults](/api/language#defaults) like stop words. The config lets you
|
[language defaults](/api/language#defaults) like stop words. The config lets you
|
||||||
provide three optional **callback functions** that give you access to the
|
provide five optional **callback functions** that give you access to the
|
||||||
language class and `nlp` object at different points of the lifecycle:
|
language class and `nlp` object at different points of the lifecycle:
|
||||||
|
|
||||||
| Callback | Description |
|
| Callback | Description |
|
||||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
|
| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
|
||||||
| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
|
| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
|
||||||
| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
|
| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
|
||||||
|
| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
|
||||||
|
| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |
|
||||||
|
|
||||||
The `@spacy.registry.callbacks` decorator lets you register your custom function
|
The `@spacy.registry.callbacks` decorator lets you register your custom function
|
||||||
in the `callbacks` [registry](/api/top-level#registry) under a given name. You
|
in the `callbacks` [registry](/api/top-level#registry) under a given name. You
|
||||||
|
@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
|
||||||
a block contains a key starting with an `@`, it's interpreted as a reference to
|
a block contains a key starting with an `@`, it's interpreted as a reference to
|
||||||
a function. Because you've registered the function, spaCy knows how to create it
|
a function. Because you've registered the function, spaCy knows how to create it
|
||||||
when you reference `"customize_language_data"` in your config. Here's an example
|
when you reference `"customize_language_data"` in your config. Here's an example
|
||||||
of a callback that runs before the `nlp` object is created and adds a few custom
|
of a callback that runs before the `nlp` object is created and adds a custom
|
||||||
tokenization rules to the defaults:
|
stop word to the defaults:
|
||||||
|
|
||||||
> #### config.cfg
|
> #### config.cfg
|
||||||
>
|
>
|
||||||
|
@ -643,7 +645,7 @@ import spacy
|
||||||
@spacy.registry.callbacks("customize_language_data")
|
@spacy.registry.callbacks("customize_language_data")
|
||||||
def create_callback():
|
def create_callback():
|
||||||
def customize_language_data(lang_cls):
|
def customize_language_data(lang_cls):
|
||||||
lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
|
lang_cls.Defaults.stop_words.add("good")
|
||||||
return lang_cls
|
return lang_cls
|
||||||
|
|
||||||
return customize_language_data
|
return customize_language_data
|
||||||
|
@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
### functions.py {highlight="5,8-10"}
|
### functions.py {highlight="5,7-9"}
|
||||||
from typing import List
|
from typing import List
|
||||||
import spacy
|
import spacy
|
||||||
|
|
||||||
@spacy.registry.callbacks("customize_language_data")
|
@spacy.registry.callbacks("customize_language_data")
|
||||||
def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
|
def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
|
||||||
def customize_language_data(lang_cls):
|
def customize_language_data(lang_cls):
|
||||||
lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
|
lang_cls.Defaults.stop_words.update(extra_stop_words)
|
||||||
lang_cls.Defaults.stop_words.add(extra_stop_words)
|
|
||||||
if debug:
|
if debug:
|
||||||
print("Updated stop words and tokenizer suffixes")
|
print("Updated stop words")
|
||||||
return lang_cls
|
return lang_cls
|
||||||
|
|
||||||
return customize_language_data
|
return customize_language_data
|
||||||
|
@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
|
||||||
$ python -m spacy train config.cfg --output ./output --code ./functions.py
|
$ python -m spacy train config.cfg --output ./output --code ./functions.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Example: Modifying tokenizer settings {#custom-tokenizer}
|
||||||
|
|
||||||
|
Use the `initialize.before_init` callback to modify the tokenizer settings when
|
||||||
|
training a new pipeline. Write a registered callback that modifies the tokenizer
|
||||||
|
settings and specify this callback in your config:
|
||||||
|
|
||||||
|
> #### config.cfg
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [initialize]
|
||||||
|
>
|
||||||
|
> [initialize.before_init]
|
||||||
|
> @callbacks = "customize_tokenizer"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### functions.py
|
||||||
|
from spacy.util import registry, compile_suffix_regex
|
||||||
|
|
||||||
|
@registry.callbacks("customize_tokenizer")
|
||||||
|
def make_customize_tokenizer():
|
||||||
|
def customize_tokenizer(nlp):
|
||||||
|
# remove a suffix
|
||||||
|
suffixes = list(nlp.Defaults.suffixes)
|
||||||
|
suffixes.remove("\\[")
|
||||||
|
suffix_regex = compile_suffix_regex(suffixes)
|
||||||
|
nlp.tokenizer.suffix_search = suffix_regex.search
|
||||||
|
|
||||||
|
# add a special case
|
||||||
|
nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
|
||||||
|
return customize_tokenizer
|
||||||
|
```
|
||||||
|
|
||||||
|
When training, provide the function above with the `--code` option:
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy train config.cfg --code ./functions.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Because this callback is only called in the one-time initialization step before
|
||||||
|
training, the callback code does not need to be packaged with the final pipeline
|
||||||
|
package. However, to make it easier for others to replicate your training setup,
|
||||||
|
you can choose to package the initialization callbacks with the pipeline package
|
||||||
|
or to publish them separately.
|
||||||
|
|
||||||
|
<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
|
||||||
|
|
||||||
|
- `nlp.before_creation` is the best place to modify language defaults other than
|
||||||
|
the tokenizer settings.
|
||||||
|
- `initialize.before_init` is the best place to modify tokenizer settings when
|
||||||
|
training a new pipeline.
|
||||||
|
|
||||||
|
Unlike the other language defaults, the tokenizer settings are saved with the
|
||||||
|
pipeline when you call `nlp.to_disk()`, so modifications made in `nlp.before_creation`
|
||||||
|
will be clobbered by the saved settings when the trained pipeline is loaded from
|
||||||
|
disk.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
#### Example: Custom logging function {#custom-logging}
|
#### Example: Custom logging function {#custom-logging}
|
||||||
|
|
||||||
During training, the results of each step are passed to a logger function. By
|
During training, the results of each step are passed to a logger function. By
|
||||||
|
@ -1060,7 +1120,7 @@ In this example we assume a custom function `read_custom_data` which loads or
|
||||||
generates texts with relevant text classification annotations. Then, small
|
generates texts with relevant text classification annotations. Then, small
|
||||||
lexical variations of the input text are created before generating the final
|
lexical variations of the input text are created before generating the final
|
||||||
[`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
|
[`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
|
||||||
you register the function creating the custom reader in the `readers`
|
you register the function creating the custom reader in the `readers`
|
||||||
[registry](/api/top-level#registry) and assign it a string name, so it can be
|
[registry](/api/top-level#registry) and assign it a string name, so it can be
|
||||||
used in your config. All arguments on the registered function become available
|
used in your config. All arguments on the registered function become available
|
||||||
as **config settings** – in this case, `source`.
|
as **config settings** – in this case, `source`.
|
||||||
|
|
|
@ -930,6 +930,55 @@ treebank.
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|
||||||
|
#### Modifying tokenizer settings
|
||||||
|
|
||||||
|
If you were using a base model with `spacy train` to customize the tokenizer
|
||||||
|
settings in v2, your modifications can be provided in the
|
||||||
|
`[initialize.before_init]` callback.
|
||||||
|
|
||||||
|
Write a registered callback that modifies the tokenizer settings and specify
|
||||||
|
this callback in your config:
|
||||||
|
|
||||||
|
> #### config.cfg
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [initialize]
|
||||||
|
>
|
||||||
|
> [initialize.before_init]
|
||||||
|
> @callbacks = "customize_tokenizer"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### functions.py
|
||||||
|
from spacy.util import registry, compile_suffix_regex
|
||||||
|
|
||||||
|
@registry.callbacks("customize_tokenizer")
|
||||||
|
def make_customize_tokenizer():
|
||||||
|
def customize_tokenizer(nlp):
|
||||||
|
# remove a suffix
|
||||||
|
suffixes = list(nlp.Defaults.suffixes)
|
||||||
|
suffixes.remove("\\[")
|
||||||
|
suffix_regex = compile_suffix_regex(suffixes)
|
||||||
|
nlp.tokenizer.suffix_search = suffix_regex.search
|
||||||
|
|
||||||
|
# add a special case
|
||||||
|
nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
|
||||||
|
return customize_tokenizer
|
||||||
|
```
|
||||||
|
|
||||||
|
When training, provide the function above with the `--code` option:
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy train config.cfg --code ./functions.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The train step requires the `--code` option with your registered functions from
|
||||||
|
the `[initialize]` block, but since those callbacks are only required during the
|
||||||
|
initialization step, you don't need to provide them with the final pipeline
|
||||||
|
package. However, to make it easier for others to replicate your training setup,
|
||||||
|
you can choose to package the initialization callbacks with the pipeline package
|
||||||
|
or to publish them separately.
|
||||||
|
|
||||||
#### Training via the Python API {#migrating-training-python}
|
#### Training via the Python API {#migrating-training-python}
|
||||||
|
|
||||||
For most use cases, you **shouldn't** have to write your own training scripts
|
For most use cases, you **shouldn't** have to write your own training scripts
|
||||||
|
|
Loading…
Reference in New Issue