Merge remote-tracking branch 'upstream/develop' into feature/missing-dep

svlandeg 2021-01-13 17:46:12 +01:00
commit fec9b81aa2
11 changed files with 229 additions and 41 deletions

View File

@@ -124,3 +124,5 @@ lookups = null
tokenizer = {}
# Arguments for initialize methods of the components (keyed by component)
components = {}
before_init = null
after_init = null
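
The two new settings default to `null`. When set, each must resolve to a registered callback that receives the `nlp` object, e.g. via the `callbacks` registry. A minimal sketch (the registry name `"customize_tokenizer"` mirrors the docs example added further down in this commit and is only an illustration):

```python
from spacy.util import registry

@registry.callbacks("customize_tokenizer")
def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        # runs once, before the pipeline components are initialized
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
    return customize_tokenizer
```

The config would then point `[initialize.before_init]` at it with `@callbacks = "customize_tokenizer"`.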

View File

@@ -1209,6 +1209,9 @@ class Language:
config = self.config.interpolate()
# These are the settings provided in the [initialize] block in the config
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
before_init = I["before_init"]
if before_init is not None:
before_init(self)
init_vocab(
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
)
@@ -1240,6 +1243,9 @@ class Language:
self._optimizer = sgd
elif self._optimizer is None:
self._optimizer = self.create_optimizer()
after_init = I["after_init"]
if after_init is not None:
after_init(self)
return self._optimizer
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:

View File

@@ -256,8 +256,14 @@ class Tagger(TrainablePipe):
DOCS: https://nightly.spacy.io/api/tagger#get_loss
"""
validate_examples(examples, "Tagger.get_loss")
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, missing_value="")
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
# Convert empty tag "" to missing value None so that both misaligned
# tokens and tokens with missing annotation have the default missing
# value None.
truths = []
for eg in examples:
eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
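
The net effect of this change: tokens that were never annotated (empty tag `""`) and tokens that couldn't be aligned (already `None`) both end up with the loss function's default missing value. A tiny self-contained illustration of the conversion (the tag values are made up):

```python
# "" = token with no annotation, None = misaligned token
aligned_tags = ["", "V", None, "J", ""]
truths = [tag if tag != "" else None for tag in aligned_tags]
assert truths == [None, "V", None, "J", None]
```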

View File

@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
# fmt: on
class Config:
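
Per the annotations, each callback takes the `nlp` object and returns a `Language`. As the `Language.initialize` hunk above shows, the return value isn't actually consumed, so in-place modification is what matters; the callbacks in the test further down both modify and return `nlp`. A minimal callable matching the declared type (the meta key is purely illustrative):

```python
def before_init(nlp):
    # modify the nlp object in place; returning it satisfies the declared type
    nlp.meta["setup"] = "custom"
    return nlp
```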

View File

@@ -37,7 +37,16 @@ TRAIN_DATA = [
]
PARTIAL_DATA = [
# partial annotation
("I like green eggs", {"tags": ["", "V", "J", ""]}),
# misaligned partial annotation
(
"He hates green eggs",
{
"words": ["He", "hate", "s", "green", "eggs"],
"tags": ["", "V", "S", "J", ""],
},
),
]
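
For context, entries like these are turned into `Example` objects against a blank pipeline before the tagger's update/loss code runs; alignment between the provided `words` and the doc's own tokenization is handled by the `Example`. A minimal sketch of that conversion (the blank English pipeline and variable names are assumptions):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
# one partially annotated entry, as in PARTIAL_DATA above
text, annotations = ("I like green eggs", {"tags": ["", "V", "J", ""]})
example = Example.from_dict(nlp.make_doc(text), annotations)
```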

View File

@@ -8,6 +8,7 @@ from spacy.util import registry, load_model_from_config, load_config
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from catalogue import RegistryError
from ..util import make_tempdir
@@ -446,3 +447,21 @@ def test_config_validate_literal(parser_config_string):
nlp.add_pipe("parser", config=config)
config["model"]["state_type"] = "ner"
nlp.add_pipe("parser", config=config)
def test_config_only_resolve_relevant_blocks():
"""Test that only the relevant blocks are resolved in the different methods
and that invalid blocks are ignored if needed. For instance, the [initialize]
block shouldn't be resolved at runtime.
"""
nlp = English()
config = nlp.config
config["training"]["before_to_disk"] = {"@misc": "nonexistent"}
config["initialize"]["lookups"] = {"@misc": "nonexistent"}
# This shouldn't resolve [training] or [initialize]
nlp = load_model_from_config(config, auto_fill=True)
# This will raise for nonexistent value
with pytest.raises(RegistryError):
nlp.initialize()
nlp.config["initialize"]["lookups"] = None
nlp.initialize()

View File

@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
ran_before = False
ran_after = False
ran_after_pipeline = False
ran_before_init = False
ran_after_init = False
@registry.callbacks(f"{name}_before")
def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
return after_pipeline_creation
@registry.callbacks(f"{name}_before_init")
def make_before_init():
def before_init(nlp):
nonlocal ran_before_init
ran_before_init = True
nlp.meta["before_init"] = "before"
return nlp
return before_init
@registry.callbacks(f"{name}_after_init")
def make_after_init():
def after_init(nlp):
nonlocal ran_after_init
ran_after_init = True
nlp.meta["after_init"] = "after"
return nlp
return after_init
config = {
"nlp": {
"pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
"after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
},
"components": {"sentencizer": {"factory": "sentencizer"}},
"initialize": {
"before_init": {"@callbacks": f"{name}_before_init"},
"after_init": {"@callbacks": f"{name}_after_init"},
},
}
nlp = English.from_config(config)
assert all([ran_before, ran_after, ran_after_pipeline])
assert nlp.Defaults.foo == "bar"
assert nlp.meta["foo"] == "bar"
assert nlp.meta["bar"] == "baz"
assert "before_init" not in nlp.meta
assert "after_init" not in nlp.meta
assert nlp.pipe_names == ["sentencizer"]
assert nlp("text")
nlp.initialize()
assert nlp.meta["before_init"] == "before"
assert nlp.meta["after_init"] == "after"
assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
def test_language_from_config_before_after_init_invalid():

View File

@@ -59,6 +59,19 @@ def train(
batcher = T["batcher"]
train_logger = T["logger"]
before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
# Helper function to save checkpoints. This is a closure for convenience,
# to avoid passing in all the args all the time.
def save_checkpoint(is_best):
with nlp.use_params(optimizer.averages):
before_to_disk(nlp).to_disk(output_path / DIR_MODEL_LAST)
if is_best:
# Avoid saving twice (saving will be more expensive than
# the dir copy)
if (output_path / DIR_MODEL_BEST).exists():
shutil.rmtree(output_path / DIR_MODEL_BEST)
shutil.copytree(output_path / DIR_MODEL_LAST, output_path / DIR_MODEL_BEST)
# Components that shouldn't be updated during training
frozen_components = T["frozen_components"]
# Create iterator, which yields out info after each optimization step.
@@ -87,40 +100,31 @@ def train(
if is_best_checkpoint is not None and output_path is not None:
with nlp.select_pipes(disable=frozen_components):
update_meta(T, nlp, info)
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / DIR_MODEL_LAST)
if is_best_checkpoint:
with nlp.use_params(optimizer.averages):
nlp.to_disk(output_path / DIR_MODEL_BEST)
save_checkpoint(is_best_checkpoint)
except Exception as e:
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
# specific error, but we do want to warn that we're trying
# to do something here.
stdout.write(
msg.warn(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
f"Encountered exception: {repr(e)}"
)
+ "\n"
)
raise e
finally:
finalize_logger()
if optimizer.averages:
nlp.use_params(optimizer.averages)
if output_path is not None:
final_model_path = output_path / DIR_MODEL_LAST
nlp.to_disk(final_model_path)
# This will only run if we don't hit an error
stdout.write(
msg.good("Saved pipeline to output directory", final_model_path) + "\n"
)
return (nlp, final_model_path)
else:
return (nlp, None)
save_checkpoint(False)
# This will only run if we didn't hit an error
if optimizer.averages:
nlp.use_params(optimizer.averages)
if output_path is not None:
stdout.write(
msg.good("Saved pipeline to output directory", output_path / DIR_MODEL_LAST)
+ "\n"
)
return (nlp, output_path / DIR_MODEL_LAST)
else:
return (nlp, None)
def train_while_improving(
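
The refactor above folds the duplicated save logic into the `save_checkpoint` closure: the latest model is always serialized to `DIR_MODEL_LAST`, and the best model is produced by copying that directory instead of serializing the pipeline a second time. A standalone sketch of the same pattern (the paths and function signature are illustrative, not the CLI's actual helper):

```python
import shutil
from pathlib import Path

def save_checkpoint(nlp, output_path: Path, is_best: bool) -> None:
    last = output_path / "model-last"
    best = output_path / "model-best"
    nlp.to_disk(last)                   # always save the latest model
    if is_best:
        if best.exists():
            shutil.rmtree(best)         # replace any previous best model
        shutil.copytree(last, best)     # copy instead of serializing twice
```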

View File

@@ -716,7 +716,7 @@ that we want to classify as being related or not. As these candidate pairs are
typically formed within one document, this function takes a [`Doc`](/api/doc) as
input and outputs a `List` of `Span` tuples. For instance, the following
implementation takes any two entities from the same document, as long as they
are within a **maximum distance** (in number of tokens) of eachother:
are within a **maximum distance** (in number of tokens) of each other:
> #### config.cfg (excerpt)
>
@@ -742,7 +742,7 @@ def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
return get_candidates
```
This function in added to the [`@misc` registry](/api/top-level#registry) so we
This function is added to the [`@misc` registry](/api/top-level#registry) so we
can refer to it from the config, and easily swap it out for any other candidate
generation function.

View File

@@ -611,14 +611,16 @@ subclass and language data from scratch it's often enough to make a few small
modifications, like adjusting the
[tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
[language defaults](/api/language#defaults) like stop words. The config lets you
provide three optional **callback functions** that give you access to the
provide five optional **callback functions** that give you access to the
language class and `nlp` object at different points of the lifecycle:
| Callback | Description |
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
| Callback | Description |
| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |
The `@spacy.registry.callbacks` decorator lets you register your custom function
in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
a block contains a key starting with an `@`, it's interpreted as a reference to
a function. Because you've registered the function, spaCy knows how to create it
when you reference `"customize_language_data"` in your config. Here's an example
of a callback that runs before the `nlp` object is created and adds a few custom
tokenization rules to the defaults:
of a callback that runs before the `nlp` object is created and adds a custom
stop word to the defaults:
> #### config.cfg
>
@@ -643,7 +645,7 @@ import spacy
@spacy.registry.callbacks("customize_language_data")
def create_callback():
    def customize_language_data(lang_cls):
        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
        lang_cls.Defaults.stop_words.add("good")
        return lang_cls
    return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
> ```
```python
### functions.py {highlight="5,8-10"}
### functions.py {highlight="5,7-9"}
from typing import List
import spacy
@spacy.registry.callbacks("customize_language_data")
def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
    def customize_language_data(lang_cls):
        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
        lang_cls.Defaults.stop_words.add(extra_stop_words)
        lang_cls.Defaults.stop_words.update(extra_stop_words)
        if debug:
            print("Updated stop words and tokenizer suffixes")
            print("Updated stop words")
        return lang_cls
    return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
$ python -m spacy train config.cfg --output ./output --code ./functions.py
```
#### Example: Modifying tokenizer settings {#custom-tokenizer}
Use the `initialize.before_init` callback to modify the tokenizer settings when
training a new pipeline. Write a registered callback that modifies the tokenizer
settings and specify this callback in your config:
> #### config.cfg
>
> ```ini
> [initialize]
>
> [initialize.before_init]
> @callbacks = "customize_tokenizer"
> ```
```python
### functions.py
from spacy.util import registry, compile_suffix_regex
@registry.callbacks("customize_tokenizer")
def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        # remove a suffix
        suffixes = list(nlp.Defaults.suffixes)
        suffixes.remove("\\[")
        suffix_regex = compile_suffix_regex(suffixes)
        nlp.tokenizer.suffix_search = suffix_regex.search
        # add a special case
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
    return customize_tokenizer
```
When training, provide the function above with the `--code` option:
```cli
$ python -m spacy train config.cfg --code ./functions.py
```
Because this callback is only called in the one-time initialization step before
training, the callback code does not need to be packaged with the final pipeline
package. However, to make it easier for others to replicate your training setup,
you can choose to package the initialization callbacks with the pipeline package
or to publish them separately.
<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
- `nlp.before_creation` is the best place to modify language defaults other than
the tokenizer settings.
- `initialize.before_init` is the best place to modify tokenizer settings when
training a new pipeline.
Unlike the other language defaults, the tokenizer settings are saved with the
pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
will be clobbered by the saved settings when the trained pipeline is loaded from
disk.
</Infobox>
#### Example: Custom logging function {#custom-logging}
During training, the results of each step are passed to a logger function. By

View File

@@ -930,6 +930,55 @@ treebank.
</Project>
#### Modifying tokenizer settings
If you were using a base model with `spacy train` to customize the tokenizer
settings in v2, your modifications can be provided in the
`[initialize.before_init]` callback.
Write a registered callback that modifies the tokenizer settings and specify
this callback in your config:
> #### config.cfg
>
> ```ini
> [initialize]
>
> [initialize.before_init]
> @callbacks = "customize_tokenizer"
> ```
```python
### functions.py
from spacy.util import registry, compile_suffix_regex
@registry.callbacks("customize_tokenizer")
def make_customize_tokenizer():
    def customize_tokenizer(nlp):
        # remove a suffix
        suffixes = list(nlp.Defaults.suffixes)
        suffixes.remove("\\[")
        suffix_regex = compile_suffix_regex(suffixes)
        nlp.tokenizer.suffix_search = suffix_regex.search
        # add a special case
        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
    return customize_tokenizer
```
When training, provide the function above with the `--code` option:
```cli
$ python -m spacy train config.cfg --code ./functions.py
```
The train step requires the `--code` option with your registered functions from
the `[initialize]` block, but since those callbacks are only required during the
initialization step, you don't need to provide them with the final pipeline
package. However, to make it easier for others to replicate your training setup,
you can choose to package the initialization callbacks with the pipeline package
or to publish them separately.
#### Training via the Python API {#migrating-training-python}
For most use cases, you **shouldn't** have to write your own training scripts