From a103ab5f1a038ccbd668e5e33d0bee2dabd75b4e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Wed, 30 Sep 2020 23:03:47 +0200
Subject: [PATCH] Update augmenter lookups and docs

---
 spacy/errors.py                       |  6 ++++
 spacy/tests/training/test_training.py | 14 ++++--
 spacy/training/augment.py             | 51 ++++++++++++++++++++-------
 website/docs/api/corpus.md            |  6 ++--
 website/docs/api/top-level.md         | 32 ++++++++++++++++-
 website/docs/usage/training.md        | 44 ++++++++++++++++++++---
 6 files changed, 131 insertions(+), 22 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 233ff29bd..4ba51f669 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,12 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E912 = ("No orth_variants lookups table for data augmentation available for "
+            "language '{lang}'. If orth_variants are available in "
+            "spacy-lookups-data, make sure the package is installed and the "
+            "table is loaded in the [initialize.lookups] block of your config. "
+            "Alternatively, you can provide your own Lookups object with a "
+            "table orth_variants as the argument 'lookups' of the augmenter.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 81e533a5a..af3fe63c2 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
 from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
+from spacy.lookups import Lookups
 from spacy.util import get_words_and_spaces, minibatch
 from thinc.api import compounding
 import pytest
@@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_make_orth_variants(doc):
     nlp = English()
+    orth_variants = {
+        "single": [
+            {"tags": ["NFP"], "variants": ["…", "..."]},
+            {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
+        ]
+    }
+    lookups = Lookups()
+    lookups.add_table("orth_variants", orth_variants)
+    augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "roundtrip.spacy"
         DocBin(docs=[doc]).to_disk(output_file)
         # due to randomness, test only that this runs with no errors for now
-        reader = Corpus(
-            output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
-        )
+        reader = Corpus(output_file, augmenter=augmenter)
         list(reader(nlp))
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 95662eafa..176530a1c 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,30 +1,50 @@
-from typing import Callable
+from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
 import random
 import itertools
 import copy
 from functools import partial
-from ..util import registry
+
+from ..util import registry, logger
 from ..tokens import Doc
+from .example import Example
+from ..lookups import Lookups
+from ..errors import Errors
-
-@registry.augmenters("spacy.dont_augment.v1")
-def create_null_augmenter():
-    return dont_augment
+if TYPE_CHECKING:
+    from ..language import Language  # noqa: F401


 @registry.augmenters("spacy.orth_variants.v1")
-def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
+def create_orth_variants_augmenter(
+    level: float, lower: float, lookups: Optional[Lookups] = None,
+) -> Callable[["Language", Example], Iterator[Example]]:
     """Create a data augmentation callback that uses orth-variant replacement.
     The callback can be added to a corpus or other data iterator during training.
     """
-    return partial(orth_variants_augmenter, level=level, lower=lower)
+    return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)


-def dont_augment(nlp, example):
+def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
     yield example


-def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
+def orth_variants_augmenter(
+    nlp: "Language",
+    example: Example,
+    *,
+    level: float = 0.0,
+    lower: float = 0.0,
+    lookups: Optional[Lookups] = None,
+) -> Iterator[Example]:
+    table_name = "orth_variants"
+    if lookups is not None:
+        orth_variants = lookups.get_table(table_name, {})
+        logger.debug("Using data augmentation orth variants from provided lookups")
+    else:
+        orth_variants = nlp.vocab.lookups.get_table(table_name, {})
+        logger.debug("Using data augmentation orth variants from default vocab lookups")
+    if not orth_variants:
+        raise ValueError(Errors.E912.format(lang=nlp.lang))
     if random.random() >= level:
         yield example
     else:
@@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
             nlp,
             raw_text,
             orig_dict["token_annotation"],
+            orth_variants,
             lower=raw_text is not None and random.random() < lower,
         )
         if variant_text:
@@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
         yield example.from_dict(doc, orig_dict)


-def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
+def make_orth_variants(
+    nlp: "Language",
+    raw: str,
+    token_dict: Dict[str, List[str]],
+    orth_variants: Dict[str, list],
+    *,
+    lower: bool = False,
+) -> Tuple[str, Dict[str, List[str]]]:
     orig_token_dict = copy.deepcopy(token_dict)
-    orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
     words = token_dict.get("words", [])
diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md
index 37f24819d..58006a19b 100644
--- a/website/docs/api/corpus.md
+++ b/website/docs/api/corpus.md
@@ -7,9 +7,11 @@ new: 3
 ---

 This class manages annotated corpora and can be used for training and
-development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
+development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
 customize the data loading during training, you can register your own
-[data readers and batchers](/usage/training#custom-code-readers-batchers).
+[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
+see the usage guide on [data utilities](/usage/training#data) for more details
+and examples.

 ## Config and implementation {#config}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 7f1b1ed7f..da24593e6 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -7,7 +7,8 @@ menu:
   - ['Loggers', 'loggers']
   - ['Readers', 'readers']
   - ['Batchers', 'batchers']
-  - ['Data & Alignment', 'gold']
+  - ['Augmenters', 'augmenters']
+  - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -313,6 +314,7 @@ factories.
 | Registry name     | Description |
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `architectures`   | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
+| `augmenters`      | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
 | `batchers`        | Registry for training and evaluation [data batchers](#batchers). |
 | `callbacks`       | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
 | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
@@ -618,6 +620,34 @@ sequences in the batch.
 | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
 | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |

+## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
+
+### orth_variants {#orth_variants tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [corpora.train.augmenter]
+> @augmenters = "spacy.orth_variants.v1"
+> level = 0.0
+> lower = 0.0
+> lookups = null
+> ```
+
+Create a data augmentation callback that uses orth-variant replacement. The
+callback can be added to a corpus or other data iterator during training. This
+is especially useful for punctuation and case replacement, to help generalize
+beyond corpora that consistently use one style of punctuation, e.g. only smart
+quotes or no smart quotes at all.
+
+| Name        | Description |
+| ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `level`     | The percentage of texts that will be augmented. ~~float~~ |
+| `lower`     | The percentage of augmented texts that will be lowercased. ~~float~~ |
+| `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
+| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
+
 ## Training data and alignment {#gold source="spacy/training"}

 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index c0658a58c..51aa82618 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:

 ## Data utilities {#data}

-spaCy includes various features and utilities to make it easy to train from your
-own data. If you have training data in a standard format like `.conll` or
-`.conllu`, the easiest way to convert it for use with spaCy is to run
-[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
+spaCy includes various features and utilities to make it easy to train models
+using your own data, manage training and evaluation corpora, convert existing
+annotations and configure data augmentation strategies for more robust models.
+
+### Converting existing corpora and annotations {#data-convert}
+
+If you have training data in a standard format like `.conll` or `.conllu`, the
+easiest way to convert it for use with spaCy is to run
+[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
+By default, the command will pick the converter based on the file extension.

 ```cli
 $ python -m spacy convert ./train.gold.conll ./corpus
 ```

+> #### 💡 Tip: Converting from Prodigy
+>
+> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
+> training data, you can run the
+> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
+> merge and export multiple datasets for use with
+> [`spacy train`](/api/cli#train). Different types of annotations on the same
+> text will be combined, giving you one corpus to train multiple components.
+
 Training workflows often consist of multiple steps, from preprocessing the data
 all the way to packaging and deploying the trained model.
 [spaCy projects](/usage/projects) let you define all steps in one file, manage
 data assets, track changes and share your end-to-end processes with your team.

+The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
+one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
+storage**, especially when packing multiple documents together. You can also
+create `Doc` objects manually, so you can write your own custom logic to
+convert and store existing annotations for use in spaCy.
+
+```python
+### Training data from Doc objects {highlight="6-9"}
+import spacy
+from spacy.tokens import Doc, DocBin
+
+nlp = spacy.blank("en")
+docbin = DocBin()
+words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
+spaces = [True, True, True, True, True, True, True, False]
+ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
+doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
+docbin.add(doc)
+docbin.to_disk("./train.spacy")
+```
+
 ### Working with corpora {#data-corpora}

 > #### Example
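---

As a usage sketch (not part of the patch): the snippet below wires the new `lookups` argument into a `Corpus` reader, following the updated test above. The orth variants table contents and the `./train.spacy` path are illustrative placeholders.

```python
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

# Illustrative orth variants table; real tables ship with spacy-lookups-data
orth_variants = {
    "single": [
        {"tags": ["NFP"], "variants": ["…", "..."]},
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
    ]
}
lookups = Lookups()
lookups.add_table("orth_variants", orth_variants)

# Augment ~20% of examples and lowercase half of the augmented raw texts
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)

# The corpus applies the augmenter to each example as it streams them;
# "./train.spacy" is a placeholder path to a DocBin saved to disk
nlp = English()
reader = Corpus("./train.spacy", augmenter=augmenter)
examples = list(reader(nlp))
```

If `lookups` is omitted, the augmenter falls back to the table in `nlp.vocab.lookups` and raises the new `E912` error when no `orth_variants` table can be found.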