diff --git a/spacy/errors.py b/spacy/errors.py
index 453e98b59..7cf9e54e4 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -501,6 +501,9 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E872 = ("Unable to copy tokenizer from base model due to different "
+            'tokenizer settings: current tokenizer config "{curr_config}" '
+            'vs. base model "{base_config}"')
     E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
             "'{text}'. This is likely a bug in spaCy, so feel free to open an "
             "issue: https://github.com/explosion/spaCy/issues")
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 5111b80dc..055f30f42 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -8,3 +8,4 @@ from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .loggers import console_logger, wandb_logger  # noqa: F401
+from .callbacks import create_copy_from_base_model  # noqa: F401
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
new file mode 100644
index 000000000..2a21be98c
--- /dev/null
+++ b/spacy/training/callbacks.py
@@ -0,0 +1,45 @@
+from typing import Callable, Optional
+from ..errors import Errors
+from ..language import Language
+from ..util import load_model, registry, logger
+
+
+@registry.callbacks("spacy.copy_from_base_model.v1")
+def create_copy_from_base_model(
+    tokenizer: Optional[str] = None,
+    vocab: Optional[str] = None,
+) -> Callable[[Language], None]:
+    """Create a callback that copies the tokenizer and/or vocab from other
+    pipelines into the current `nlp` object.
+
+    tokenizer (Optional[str]): The pipeline to copy the tokenizer from.
+        Nothing is copied if the value is None or empty.
+    vocab (Optional[str]): The pipeline to copy the vocab from. Nothing is
+        copied if the value is None or empty.
+    RETURNS (Callable[[Language], None]): A function that takes the current
+        `nlp` object and modifies its tokenizer and vocab in place.
+    """
+
+    def copy_from_base_model(nlp: Language) -> None:
+        if tokenizer:
+            logger.info("Copying tokenizer from: %s", tokenizer)
+            base_nlp = load_model(tokenizer)
+            # Only copy if both pipelines use the same tokenizer settings,
+            # otherwise raise E872.
+            if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
+                nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
+            else:
+                raise ValueError(
+                    Errors.E872.format(
+                        curr_config=nlp.config["nlp"]["tokenizer"],
+                        base_config=base_nlp.config["nlp"]["tokenizer"],
+                    )
+                )
+        if vocab:
+            logger.info("Copying vocab from: %s", vocab)
+            # only reload if the vocab is from a different model
+            if tokenizer != vocab:
+                base_nlp = load_model(vocab)
+            nlp.vocab.from_bytes(base_nlp.vocab.to_bytes())
+
+    return copy_from_base_model
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 38bc40b11..cfaa75bff 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -8,6 +8,7 @@ menu:
   - ['Readers', 'readers']
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
+  - ['Callbacks', 'callbacks']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -785,6 +786,35 @@ useful for making the model less sensitive to capitalization.
 | `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                     |
 | **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
 
+## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"}
+
+The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at
+several points in the lifecycle that can be used to modify the `nlp` object.
+
+### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.before_init]
+> @callbacks = "spacy.copy_from_base_model.v1"
+> tokenizer = "en_core_sci_md"
+> vocab = "en_core_sci_md"
+> ```
+
+Copy the tokenizer and/or vocab from the specified models. It's similar to the
+v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in
+combination with
+[sourced components](/usage/processing-pipelines#sourced-components) when
+fine-tuning an existing pipeline. The vocab includes the lookups and the vectors
+from the specified model. Intended for use in `[initialize.before_init]`.
+
+| Name        | Description                                                                                                             |
+| ----------- | ----------------------------------------------------------------------------------------------------------------------- |
+| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~                                          |
+| `vocab`     | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~  |
+| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~ |
+
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}