diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index c9f82caa0..0f7226083 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
diff --git a/spacy/language.py b/spacy/language.py
index f695ddc9e..91f4b99d4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer
 
     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 611eef033..d3f8c339c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
         validate_examples(examples, "Tagger.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
-        truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        # Convert the empty tag "" to the missing value None so that both
+        # misaligned tokens and tokens with missing annotation have the
+        # default missing value None.
+        truths = []
+        for eg in examples:
+            eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
+            truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError(Errors.E910.format(name=self.name))
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 3ea611287..d041845f3 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on
 
     class Config:
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 6f75c52a4..282961755 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -37,7 +37,16 @@ TRAIN_DATA = [
 ]
 
 PARTIAL_DATA = [
+    # partial annotation
     ("I like green eggs", {"tags": ["", "V", "J", ""]}),
+    # misaligned partial annotation
+    (
+        "He hates green eggs",
+        {
+            "words": ["He", "hate", "s", "green", "eggs"],
+            "tags": ["", "V", "S", "J", ""],
+        },
+    ),
 ]
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 42654ff7d..6709defb8 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from catalogue import RegistryError
 
 from ..util import make_tempdir
 
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
         nlp.add_pipe("parser", config=config)
     config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
+
+
+def test_config_only_resolve_relevant_blocks():
+    """Test that only the relevant blocks are resolved in the different methods
+    and that invalid blocks are ignored if needed. For instance, the
+    [initialize] block shouldn't be resolved at runtime.
+    """
+    nlp = English()
+    config = nlp.config
+    config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
+    config["initialize"]["lookups"] = {"@misc": "nonexistent"}
+    # This shouldn't resolve [training] or [initialize]
+    nlp = load_model_from_config(config, auto_fill=True)
+    # This will raise for the nonexistent value
+    with pytest.raises(RegistryError):
+        nlp.initialize()
+    nlp.config["initialize"]["lookups"] = None
+    nlp.initialize()
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 4fbcaee9e..6ffeeadce 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False
 
     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
 
         return after_pipeline_creation
 
+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "before_creation": {"@callbacks": f"{name}_before"},
             "after_creation": {"@callbacks": f"{name}_after"},
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
 
 
 def test_language_from_config_before_after_init_invalid():
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 577c80cb3..fe2d4b18f 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -59,6 +59,19 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+
+    # Helper function to save checkpoints. This is a closure for convenience,
+    # to avoid passing in all the args all the time.
+    def save_checkpoint(is_best):
+        with nlp.use_params(optimizer.averages):
+            before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
+        if is_best:
+            # Avoid saving twice (saving will be more expensive than
+            # the dir copy)
+            if (output_path / DIR_MODEL_BEST).exists():
+                shutil.rmtree(output_path / DIR_MODEL_BEST)
+            shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
+
     # Components that shouldn't be updated during training
     frozen_components = T["frozen_components"]
     # Create iterator, which yields out info after each optimization step.
@@ -87,40 +100,31 @@ def train(
         if is_best_checkpoint is not None and output_path is not None:
             with nlp.select_pipes(disable=frozen_components):
                 update_meta(T, nlp, info)
-            with nlp.use_params(optimizer.averages):
-                nlp = before_to_disk(nlp)
-                nlp.to_disk(output_path / DIR_MODEL_LAST)
-            if is_best_checkpoint:
-                with nlp.use_params(optimizer.averages):
-                    nlp.to_disk(output_path / DIR_MODEL_BEST)
-
+            save_checkpoint(is_best_checkpoint)
     except Exception as e:
         if output_path is not None:
-            # We don't want to swallow the traceback if we don't have a
-            # specific error, but we do want to warn that we're trying
-            # to do something here.
             stdout.write(
                 msg.warn(
                     f"Aborting and saving the final best model. "
-                    f"Encountered exception: {str(e)}"
+                    f"Encountered exception: {repr(e)}"
                 )
                 + "\n"
             )
         raise e
     finally:
         finalize_logger()
-        if optimizer.averages:
-            nlp.use_params(optimizer.averages)
-        if output_path is not None:
-            final_model_path = output_path / DIR_MODEL_LAST
-            nlp.to_disk(final_model_path)
-            # This will only run if we don't hit an error
-            stdout.write(
-                msg.good("Saved pipeline to output directory", final_model_path) + "\n"
-            )
-            return (nlp, final_model_path)
-        else:
-            return (nlp, None)
+        if output_path is not None:
+            save_checkpoint(False)
+    # This will only run if we didn't hit an error
+    if optimizer.averages:
+        nlp.use_params(optimizer.averages)
+    if output_path is not None:
+        stdout.write(
+            msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+            + "\n"
+        )
+        return (nlp, output_path / DIR_MODEL_LAST)
+    else:
+        return (nlp, None)
 
 
 def train_while_improving(
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index d0a2ac819..33f647413 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
 typically formed within one document, this function takes a [`Doc`](/api/doc)
 as input and outputs a `List` of `Span` tuples. For instance, the following
 implementation takes any two entities from the same document, as long as they
-are within a **maximum distance** (in number of tokens) of eachother:
+are within a **maximum distance** (in number of tokens) of each other:
 
 > #### config.cfg (excerpt)
 >
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
     return get_candidates
 ```
 
-This function in added to the [`@misc` registry](/api/top-level#registry) so we
+This function is added to the [`@misc` registry](/api/top-level#registry) so we
 can refer to it from the config, and easily swap it out for any other candidate
 generation function.
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index f8e502966..ad5bec92a 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few small
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:
 
-| Callback                  | Description                                                                                                                                                                               |
-| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer.          |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                |
+| Callback                      | Description                                                                                                                                                                                                                 |
+| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object.                                                                                |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                                                  |
+| `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option.                           |
+| `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification.                                                                                                              |
 
 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key.
 If a block contains a key starting with an `@`, it's interpreted as a reference
 to a function. Because you've registered the function, spaCy knows how to create
 it when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:
 
 > #### config.cfg
 >
 > ```ini
@@ -643,7 +645,7 @@ import spacy
 
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls
 
     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```
 
 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy
 
 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls
 
     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 
 ```cli
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 
+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the
+tokenizer settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning">
+
+- `nlp.before_creation` is the best place to modify language defaults other
+  than the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded
+from disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}
 
 During training, the results of each step are passed to a logger function. By
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 47ddcf53a..9b911b960 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -930,6 +930,55 @@ treebank.
 
+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}
 
 For most use cases, you **shouldn't** have to write your own training scripts
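The new `[initialize]` callbacks can also be exercised directly from the Python API, without going through `spacy train`. A minimal sketch that mirrors the patterns in the tests above — the registry name `my_before_init` and the special-case token are illustrative, and mutating `nlp.config` before calling `nlp.initialize()` follows the same approach as `test_config_only_resolve_relevant_blocks()`:

```python
### init_callbacks_sketch.py
import spacy
from spacy.util import registry

@registry.callbacks("my_before_init")
def make_before_init():
    def before_init(nlp):
        # Runs inside nlp.initialize(), before the pipeline components are
        # initialized — the spot for tokenizer tweaks, like the v2 base model
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
        return nlp

    return before_init

nlp = spacy.blank("en")
# Point the [initialize] block at the registered callback, then initialize
nlp.config["initialize"]["before_init"] = {"@callbacks": "my_before_init"}
nlp.initialize()
assert len(nlp("hello _SPECIAL_ world")) == 3
```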