mirror of https://github.com/explosion/spaCy.git
Merge remote-tracking branch 'upstream/develop' into feature/missing-dep
This commit is contained in:
commit
fec9b81aa2
|
@ -124,3 +124,5 @@ lookups = null
|
|||
tokenizer = {}
|
||||
# Arguments for initialize methods of the components (keyed by component)
|
||||
components = {}
|
||||
before_init = null
|
||||
after_init = null
|
||||
|
|
|
@ -1209,6 +1209,9 @@ class Language:
|
|||
config = self.config.interpolate()
|
||||
# These are the settings provided in the [initialize] block in the config
|
||||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||||
before_init = I["before_init"]
|
||||
if before_init is not None:
|
||||
before_init(self)
|
||||
init_vocab(
|
||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||
)
|
||||
|
@ -1240,6 +1243,9 @@ class Language:
|
|||
self._optimizer = sgd
|
||||
elif self._optimizer is None:
|
||||
self._optimizer = self.create_optimizer()
|
||||
after_init = I["after_init"]
|
||||
if after_init is not None:
|
||||
after_init(self)
|
||||
return self._optimizer
|
||||
|
||||
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||||
|
|
|
@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://nightly.spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
|
||||
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
truths = []
for eg in examples:
    # Map the empty tag "" to None so it is treated as a missing value.
    # Compare by value rather than identity: `is not ""` only works when
    # the interpreter happens to intern the empty string, and CPython 3.8+
    # emits a SyntaxWarning for identity checks against a literal.
    eg_truths = [
        tag if tag != "" else None
        for tag in eg.get_aligned("TAG", as_string=True)
    ]
    truths.append(eg_truths)
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError(Errors.E910.format(name=self.name))
|
||||
|
|
|
@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
|
|||
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
|
||||
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
|
||||
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
|
||||
before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
|
||||
after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -37,7 +37,16 @@ TRAIN_DATA = [
|
|||
]
|
||||
|
||||
PARTIAL_DATA = [
|
||||
# partial annotation
|
||||
("I like green eggs", {"tags": ["", "V", "J", ""]}),
|
||||
# misaligned partial annotation
|
||||
(
|
||||
"He hates green eggs",
|
||||
{
|
||||
"words": ["He", "hate", "s", "green", "eggs"],
|
||||
"tags": ["", "V", "S", "J", ""],
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
|
|||
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
||||
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
||||
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
|
||||
from catalogue import RegistryError
|
||||
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
|
|||
nlp.add_pipe("parser", config=config)
|
||||
config["model"]["state_type"] = "ner"
|
||||
nlp.add_pipe("parser", config=config)
|
||||
|
||||
|
||||
def test_config_only_resolve_relevant_blocks():
|
||||
"""Test that only the relevant blocks are resolved in the different methods
|
||||
and that invalid blocks are ignored if needed. For instance, the [initialize]
|
||||
block shouldn't be resolved at runtime.
|
||||
"""
|
||||
nlp = English()
|
||||
config = nlp.config
|
||||
config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
|
||||
config["initialize"]["lookups"] = {"@misc": "nonexistent"}
|
||||
# This shouldn't resolve [training] or [initialize]
|
||||
nlp = load_model_from_config(config, auto_fill=True)
|
||||
# This will raise for nonexistent value
|
||||
with pytest.raises(RegistryError):
|
||||
nlp.initialize()
|
||||
nlp.config["initialize"]["lookups"] = None
|
||||
nlp.initialize()
|
||||
|
|
|
@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
|
|||
ran_before = False
|
||||
ran_after = False
|
||||
ran_after_pipeline = False
|
||||
ran_before_init = False
|
||||
ran_after_init = False
|
||||
|
||||
@registry.callbacks(f"{name}_before")
|
||||
def make_before_creation():
|
||||
|
@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
|
|||
|
||||
return after_pipeline_creation
|
||||
|
||||
@registry.callbacks(f"{name}_before_init")
|
||||
def make_before_init():
|
||||
def before_init(nlp):
|
||||
nonlocal ran_before_init
|
||||
ran_before_init = True
|
||||
nlp.meta["before_init"] = "before"
|
||||
return nlp
|
||||
|
||||
return before_init
|
||||
|
||||
@registry.callbacks(f"{name}_after_init")
|
||||
def make_after_init():
|
||||
def after_init(nlp):
|
||||
nonlocal ran_after_init
|
||||
ran_after_init = True
|
||||
nlp.meta["after_init"] = "after"
|
||||
return nlp
|
||||
|
||||
return after_init
|
||||
|
||||
config = {
|
||||
"nlp": {
|
||||
"pipeline": ["sentencizer"],
|
||||
|
@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
|
|||
"after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
|
||||
},
|
||||
"components": {"sentencizer": {"factory": "sentencizer"}},
|
||||
"initialize": {
|
||||
"before_init": {"@callbacks": f"{name}_before_init"},
|
||||
"after_init": {"@callbacks": f"{name}_after_init"},
|
||||
},
|
||||
}
|
||||
nlp = English.from_config(config)
|
||||
assert all([ran_before, ran_after, ran_after_pipeline])
|
||||
assert nlp.Defaults.foo == "bar"
|
||||
assert nlp.meta["foo"] == "bar"
|
||||
assert nlp.meta["bar"] == "baz"
|
||||
assert "before_init" not in nlp.meta
|
||||
assert "after_init" not in nlp.meta
|
||||
assert nlp.pipe_names == ["sentencizer"]
|
||||
assert nlp("text")
|
||||
nlp.initialize()
|
||||
assert nlp.meta["before_init"] == "before"
|
||||
assert nlp.meta["after_init"] == "after"
|
||||
assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
|
||||
|
||||
|
||||
def test_language_from_config_before_after_init_invalid():
|
||||
|
|
|
@ -59,6 +59,19 @@ def train(
|
|||
batcher = T["batcher"]
|
||||
train_logger = T["logger"]
|
||||
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
|
||||
|
||||
# Helper function to save checkpoints. This is a closure for convenience,
|
||||
# to avoid passing in all the args all the time.
|
||||
def save_checkpoint(is_best):
|
||||
with nlp.use_params(optimizer.averages):
|
||||
before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
|
||||
if is_best:
|
||||
# Avoid saving twice (saving will be more expensive than
|
||||
# the dir copy)
|
||||
if (output_path / DIR_MODEL_BEST).exists():
|
||||
shutil.rmtree(output_path / DIR_MODEL_BEST)
|
||||
shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
|
||||
|
||||
# Components that shouldn't be updated during training
|
||||
frozen_components = T["frozen_components"]
|
||||
# Create iterator, which yields out info after each optimization step.
|
||||
|
@ -87,40 +100,31 @@ def train(
|
|||
if is_best_checkpoint is not None and output_path is not None:
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
update_meta(T, nlp, info)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp = before_to_disk(nlp)
|
||||
nlp.to_disk(output_path / DIR_MODEL_LAST)
|
||||
if is_best_checkpoint:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk(output_path / DIR_MODEL_BEST)
|
||||
|
||||
save_checkpoint(is_best_checkpoint)
|
||||
except Exception as e:
|
||||
if output_path is not None:
|
||||
# We don't want to swallow the traceback if we don't have a
|
||||
# specific error, but we do want to warn that we're trying
|
||||
# to do something here.
|
||||
stdout.write(
|
||||
msg.warn(
|
||||
f"Aborting and saving the final best model. "
|
||||
f"Encountered exception: {str(e)}"
|
||||
f"Encountered exception: {repr(e)}"
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
raise e
|
||||
finally:
|
||||
finalize_logger()
|
||||
if optimizer.averages:
|
||||
nlp.use_params(optimizer.averages)
|
||||
if output_path is not None:
|
||||
final_model_path = output_path / DIR_MODEL_LAST
|
||||
nlp.to_disk(final_model_path)
|
||||
# This will only run if we don't hit an error
|
||||
stdout.write(
|
||||
msg.good("Saved pipeline to output directory", final_model_path) + "\n"
|
||||
)
|
||||
return (nlp, final_model_path)
|
||||
else:
|
||||
return (nlp, None)
|
||||
save_checkpoint(False)
|
||||
# This will only run if we didn't hit an error
|
||||
if optimizer.averages:
|
||||
nlp.use_params(optimizer.averages)
|
||||
if output_path is not None:
|
||||
stdout.write(
|
||||
msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
|
||||
+ "\n"
|
||||
)
|
||||
return (nlp, output_path / DIR_MODEL_LAST)
|
||||
else:
|
||||
return (nlp, None)
|
||||
|
||||
|
||||
def train_while_improving(
|
||||
|
|
|
@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
|
|||
typically formed within one document, this function takes a [`Doc`](/api/doc) as
|
||||
input and outputs a `List` of `Span` tuples. For instance, the following
|
||||
implementation takes any two entities from the same document, as long as they
|
||||
are within a **maximum distance** (in number of tokens) of eachother:
|
||||
are within a **maximum distance** (in number of tokens) of each other:
|
||||
|
||||
> #### config.cfg (excerpt)
|
||||
>
|
||||
|
@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]
|
|||
return get_candidates
|
||||
```
|
||||
|
||||
This function in added to the [`@misc` registry](/api/top-level#registry) so we
|
||||
This function is added to the [`@misc` registry](/api/top-level#registry) so we
|
||||
can refer to it from the config, and easily swap it out for any other candidate
|
||||
generation function.
|
||||
|
||||
|
|
|
@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
|
|||
modifications, like adjusting the
|
||||
[tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
|
||||
[language defaults](/api/language#defaults) like stop words. The config lets you
|
||||
provide three optional **callback functions** that give you access to the
|
||||
provide five optional **callback functions** that give you access to the
|
||||
language class and `nlp` object at different points of the lifecycle:
|
||||
|
||||
| Callback | Description |
|
||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
|
||||
| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
|
||||
| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
|
||||
| Callback | Description |
|
||||
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
|
||||
| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
|
||||
| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
|
||||
| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
|
||||
| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |
|
||||
|
||||
The `@spacy.registry.callbacks` decorator lets you register your custom function
|
||||
in the `callbacks` [registry](/api/top-level#registry) under a given name. You
|
||||
|
@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
|
|||
a block contains a key starting with an `@`, it's interpreted as a reference to
|
||||
a function. Because you've registered the function, spaCy knows how to create it
|
||||
when you reference `"customize_language_data"` in your config. Here's an example
|
||||
of a callback that runs before the `nlp` object is created and adds a few custom
|
||||
tokenization rules to the defaults:
|
||||
of a callback that runs before the `nlp` object is created and adds a custom
|
||||
stop word to the defaults:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
|
@ -643,7 +645,7 @@ import spacy
|
|||
@spacy.registry.callbacks("customize_language_data")
|
||||
def create_callback():
|
||||
def customize_language_data(lang_cls):
|
||||
lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
|
||||
lang_cls.Defaults.stop_words.add("good")
|
||||
return lang_cls
|
||||
|
||||
return customize_language_data
|
||||
|
@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
|
|||
> ```
|
||||
|
||||
```python
|
||||
### functions.py {highlight="5,8-10"}
|
||||
### functions.py {highlight="5,7-9"}
|
||||
from typing import List
|
||||
import spacy
|
||||
|
||||
@spacy.registry.callbacks("customize_language_data")
|
||||
def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
|
||||
def customize_language_data(lang_cls):
|
||||
lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
|
||||
lang_cls.Defaults.stop_words.add(extra_stop_words)
|
||||
lang_cls.Defaults.stop_words.update(extra_stop_words)
|
||||
if debug:
|
||||
print("Updated stop words and tokenizer suffixes")
|
||||
print("Updated stop words")
|
||||
return lang_cls
|
||||
|
||||
return customize_language_data
|
||||
|
@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
|
|||
$ python -m spacy train config.cfg --output ./output --code ./functions.py
|
||||
```
|
||||
|
||||
#### Example: Modifying tokenizer settings {#custom-tokenizer}
|
||||
|
||||
Use the `initialize.before_init` callback to modify the tokenizer settings when
|
||||
training a new pipeline. Write a registered callback that modifies the tokenizer
|
||||
settings and specify this callback in your config:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
> ```ini
|
||||
> [initialize]
|
||||
>
|
||||
> [initialize.before_init]
|
||||
> @callbacks = "customize_tokenizer"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### functions.py
|
||||
from spacy.util import registry, compile_suffix_regex
|
||||
|
||||
@registry.callbacks("customize_tokenizer")
|
||||
def make_customize_tokenizer():
|
||||
def customize_tokenizer(nlp):
|
||||
# remove a suffix
|
||||
suffixes = list(nlp.Defaults.suffixes)
|
||||
suffixes.remove("\\[")
|
||||
suffix_regex = compile_suffix_regex(suffixes)
|
||||
nlp.tokenizer.suffix_search = suffix_regex.search
|
||||
|
||||
# add a special case
|
||||
nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
|
||||
return customize_tokenizer
|
||||
```
|
||||
|
||||
When training, provide the function above with the `--code` option:
|
||||
|
||||
```cli
|
||||
$ python -m spacy train config.cfg --code ./functions.py
|
||||
```
|
||||
|
||||
Because this callback is only called in the one-time initialization step before
|
||||
training, the callback code does not need to be packaged with the final pipeline
|
||||
package. However, to make it easier for others to replicate your training setup,
|
||||
you can choose to package the initialization callbacks with the pipeline package
|
||||
or to publish them separately.
|
||||
|
||||
<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
|
||||
|
||||
- `nlp.before_creation` is the best place to modify language defaults other than
|
||||
the tokenizer settings.
|
||||
- `initialize.before_init` is the best place to modify tokenizer settings when
|
||||
training a new pipeline.
|
||||
|
||||
Unlike the other language defaults, the tokenizer settings are saved with the
|
||||
pipeline when you call `nlp.to_disk()`, so modifications made in `nlp.before_creation`
|
||||
will be clobbered by the saved settings when the trained pipeline is loaded from
|
||||
disk.
|
||||
|
||||
</Infobox>
|
||||
|
||||
#### Example: Custom logging function {#custom-logging}
|
||||
|
||||
During training, the results of each step are passed to a logger function. By
|
||||
|
|
|
@ -930,6 +930,55 @@ treebank.
|
|||
|
||||
</Project>
|
||||
|
||||
#### Modifying tokenizer settings
|
||||
|
||||
If you were using a base model with `spacy train` to customize the tokenizer
|
||||
settings in v2, your modifications can be provided in the
|
||||
`[initialize.before_init]` callback.
|
||||
|
||||
Write a registered callback that modifies the tokenizer settings and specify
|
||||
this callback in your config:
|
||||
|
||||
> #### config.cfg
|
||||
>
|
||||
> ```ini
|
||||
> [initialize]
|
||||
>
|
||||
> [initialize.before_init]
|
||||
> @callbacks = "customize_tokenizer"
|
||||
> ```
|
||||
|
||||
```python
|
||||
### functions.py
|
||||
from spacy.util import registry, compile_suffix_regex
|
||||
|
||||
@registry.callbacks("customize_tokenizer")
|
||||
def make_customize_tokenizer():
|
||||
def customize_tokenizer(nlp):
|
||||
# remove a suffix
|
||||
suffixes = list(nlp.Defaults.suffixes)
|
||||
suffixes.remove("\\[")
|
||||
suffix_regex = compile_suffix_regex(suffixes)
|
||||
nlp.tokenizer.suffix_search = suffix_regex.search
|
||||
|
||||
# add a special case
|
||||
nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
|
||||
return customize_tokenizer
|
||||
```
|
||||
|
||||
When training, provide the function above with the `--code` option:
|
||||
|
||||
```cli
|
||||
$ python -m spacy train config.cfg --code ./functions.py
|
||||
```
|
||||
|
||||
The train step requires the `--code` option with your registered functions from
|
||||
the `[initialize]` block, but since those callbacks are only required during the
|
||||
initialization step, you don't need to provide them with the final pipeline
|
||||
package. However, to make it easier for others to replicate your training setup,
|
||||
you can choose to package the initialization callbacks with the pipeline package
|
||||
or to publish them separately.
|
||||
|
||||
#### Training via the Python API {#migrating-training-python}
|
||||
|
||||
For most use cases, you **shouldn't** have to write your own training scripts
|
||||
|
|
Loading…
Reference in New Issue