mirror of https://github.com/explosion/spaCy.git
Update augmenter lookups and docs
This commit is contained in:
parent
5128298964
commit
a103ab5f1a
|
@ -477,6 +477,12 @@ class Errors:
|
||||||
E201 = ("Span index out of range.")
|
E201 = ("Span index out of range.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E912 = ("No orth_variants lookups table for data augmentation available for "
|
||||||
|
"language '{lang}'. If orth_variants are available in "
|
||||||
|
"spacy-lookups-data, make sure the package is installed and the "
|
||||||
|
"table is loaded in the [initialize.lookups] block of your config. "
|
||||||
|
"Alternatively, you can provide your own Lookups object with a "
|
||||||
|
"table orth_variants as the argument 'lookups' of the augmenter.")
|
||||||
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||||
"config.cfg or override it on the CLI?")
|
"config.cfg or override it on the CLI?")
|
||||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||||
|
|
|
@ -7,6 +7,7 @@ from spacy.training.converters import json_to_docs
|
||||||
from spacy.training.augment import create_orth_variants_augmenter
|
from spacy.training.augment import create_orth_variants_augmenter
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tokens import Doc, DocBin
|
from spacy.tokens import Doc, DocBin
|
||||||
|
from spacy.lookups import Lookups
|
||||||
from spacy.util import get_words_and_spaces, minibatch
|
from spacy.util import get_words_and_spaces, minibatch
|
||||||
from thinc.api import compounding
|
from thinc.api import compounding
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -492,13 +493,20 @@ def test_roundtrip_docs_to_docbin(doc):
|
||||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
def test_make_orth_variants(doc):
|
def test_make_orth_variants(doc):
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
orth_variants = {
|
||||||
|
"single": [
|
||||||
|
{"tags": ["NFP"], "variants": ["…", "..."]},
|
||||||
|
{"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
lookups = Lookups()
|
||||||
|
lookups.add_table("orth_variants", orth_variants)
|
||||||
|
augmenter = create_orth_variants_augmenter(level=0.2, lower=0.5, lookups=lookups)
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
output_file = tmpdir / "roundtrip.spacy"
|
output_file = tmpdir / "roundtrip.spacy"
|
||||||
DocBin(docs=[doc]).to_disk(output_file)
|
DocBin(docs=[doc]).to_disk(output_file)
|
||||||
# due to randomness, test only that this runs with no errors for now
|
# due to randomness, test only that this runs with no errors for now
|
||||||
reader = Corpus(
|
reader = Corpus(output_file, augmenter=augmenter)
|
||||||
output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5)
|
|
||||||
)
|
|
||||||
list(reader(nlp))
|
list(reader(nlp))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,30 +1,50 @@
|
||||||
from typing import Callable
|
from typing import Callable, Iterator, Dict, List, Tuple, Optional, TYPE_CHECKING
|
||||||
import random
|
import random
|
||||||
import itertools
|
import itertools
|
||||||
import copy
|
import copy
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from ..util import registry
|
|
||||||
|
from ..util import registry, logger
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
from .example import Example
|
||||||
|
from ..lookups import Lookups
|
||||||
|
from ..errors import Errors
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
@registry.augmenters("spacy.dont_augment.v1")
|
from ..language import Language # noqa: F401
|
||||||
def create_null_augmenter():
|
|
||||||
return dont_augment
|
|
||||||
|
|
||||||
|
|
||||||
@registry.augmenters("spacy.orth_variants.v1")
|
@registry.augmenters("spacy.orth_variants.v1")
|
||||||
def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
|
def create_orth_variants_augmenter(
|
||||||
|
level: float, lower: float, lookups: Optional[Lookups] = None,
|
||||||
|
) -> Callable[["Language", Example], Iterator[Example]]:
|
||||||
"""Create a data augmentation callback that uses orth-variant replacement.
|
"""Create a data augmentation callback that uses orth-variant replacement.
|
||||||
The callback can be added to a corpus or other data iterator during training.
|
The callback can be added to a corpus or other data iterator during training.
|
||||||
"""
|
"""
|
||||||
return partial(orth_variants_augmenter, level=level, lower=lower)
|
return partial(orth_variants_augmenter, level=level, lower=lower, lookups=lookups)
|
||||||
|
|
||||||
|
|
||||||
def dont_augment(nlp, example):
|
def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
|
||||||
yield example
|
yield example
|
||||||
|
|
||||||
|
|
||||||
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float = 0.0):
|
def orth_variants_augmenter(
|
||||||
|
nlp: "Language",
|
||||||
|
example: Example,
|
||||||
|
*,
|
||||||
|
level: float = 0.0,
|
||||||
|
lower: float = 0.0,
|
||||||
|
lookups: Optional[Lookups] = None,
|
||||||
|
) -> Iterator[Example]:
|
||||||
|
table_name = "orth_variants"
|
||||||
|
if lookups is not None:
|
||||||
|
orth_variants = lookups.get_table(table_name, {})
|
||||||
|
logger.debug("Using data augmentation orth variants from provided lookups")
|
||||||
|
else:
|
||||||
|
orth_variants = nlp.vocab.lookups.get_table(table_name, {})
|
||||||
|
logger.debug("Using data augmentation orth variants from default vocab lookups")
|
||||||
|
if not orth_variants:
|
||||||
|
raise ValueError(Errors.E912.format(lang=nlp.lang))
|
||||||
if random.random() >= level:
|
if random.random() >= level:
|
||||||
yield example
|
yield example
|
||||||
else:
|
else:
|
||||||
|
@ -37,6 +57,7 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
|
||||||
nlp,
|
nlp,
|
||||||
raw_text,
|
raw_text,
|
||||||
orig_dict["token_annotation"],
|
orig_dict["token_annotation"],
|
||||||
|
orth_variants,
|
||||||
lower=raw_text is not None and random.random() < lower,
|
lower=raw_text is not None and random.random() < lower,
|
||||||
)
|
)
|
||||||
if variant_text:
|
if variant_text:
|
||||||
|
@ -49,9 +70,15 @@ def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float =
|
||||||
yield example.from_dict(doc, orig_dict)
|
yield example.from_dict(doc, orig_dict)
|
||||||
|
|
||||||
|
|
||||||
def make_orth_variants(nlp, raw, token_dict, *, lower: bool = False):
|
def make_orth_variants(
|
||||||
|
nlp: "Language",
|
||||||
|
raw: str,
|
||||||
|
token_dict: Dict[str, List[str]],
|
||||||
|
orth_variants: Dict[str, list],
|
||||||
|
*,
|
||||||
|
lower: bool = False,
|
||||||
|
) -> Tuple[str, Dict[str, List[str]]]:
|
||||||
orig_token_dict = copy.deepcopy(token_dict)
|
orig_token_dict = copy.deepcopy(token_dict)
|
||||||
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
|
||||||
ndsv = orth_variants.get("single", [])
|
ndsv = orth_variants.get("single", [])
|
||||||
ndpv = orth_variants.get("paired", [])
|
ndpv = orth_variants.get("paired", [])
|
||||||
words = token_dict.get("words", [])
|
words = token_dict.get("words", [])
|
||||||
|
|
|
@ -7,9 +7,11 @@ new: 3
|
||||||
---
|
---
|
||||||
|
|
||||||
This class manages annotated corpora and can be used for training and
|
This class manages annotated corpora and can be used for training and
|
||||||
development datasets in the [DocBin](/api/docbin) (`.spacy`) format. To
|
development datasets in the [`DocBin`](/api/docbin) (`.spacy`) format. To
|
||||||
customize the data loading during training, you can register your own
|
customize the data loading during training, you can register your own
|
||||||
[data readers and batchers](/usage/training#custom-code-readers-batchers).
|
[data readers and batchers](/usage/training#custom-code-readers-batchers). Also
|
||||||
|
see the usage guide on [data utilities](/usage/training#data) for more details
|
||||||
|
and examples.
|
||||||
|
|
||||||
## Config and implementation {#config}
|
## Config and implementation {#config}
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,8 @@ menu:
|
||||||
- ['Loggers', 'loggers']
|
- ['Loggers', 'loggers']
|
||||||
- ['Readers', 'readers']
|
- ['Readers', 'readers']
|
||||||
- ['Batchers', 'batchers']
|
- ['Batchers', 'batchers']
|
||||||
- ['Data & Alignment', 'gold']
|
- ['Augmenters', 'augmenters']
|
||||||
|
- ['Training & Alignment', 'gold']
|
||||||
- ['Utility Functions', 'util']
|
- ['Utility Functions', 'util']
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -313,6 +314,7 @@ factories.
|
||||||
| Registry name | Description |
|
| Registry name | Description |
|
||||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||||
|
| `augmenters` | Registry for functions that create [data augmentation](#augmenters) callbacks for corpora and other training data iterators. |
|
||||||
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
| `batchers` | Registry for training and evaluation [data batchers](#batchers). |
|
||||||
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. |
|
||||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
|
@ -618,6 +620,34 @@ sequences in the batch.
|
||||||
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
|
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
|
||||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||||
|
|
||||||
|
## Augmenters {#augmenters source="spacy/training/augment.py" new="3"}
|
||||||
|
|
||||||
|
<!-- TODO: intro, explain data augmentation concept -->
|
||||||
|
|
||||||
|
### orth_variants {#orth_variants tag="registered function"}
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [corpora.train.augmenter]
|
||||||
|
> @augmenters = "spacy.orth_variants.v1"
|
||||||
|
> level = 0.0
|
||||||
|
> lower = 0.0
|
||||||
|
> lookups = null
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Create a data augmentation callback that uses orth-variant replacement. The
|
||||||
|
callback can be added to a corpus or other data iterator during training. This
|
||||||
|
is especially useful for punctuation and case replacement, to help generalize
|
||||||
|
beyond corpora that don't have smart quotes, or only have smart quotes etc.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
|
| `level` | Probability that a given example is augmented; otherwise the example is passed through unchanged. ~~float~~ |
|
||||||
|
| `lower` | Probability that the text of an augmented example is also lowercased. ~~float~~ |
|
||||||
|
| `lookups` | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||||
|
| **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
|
||||||
|
|
||||||
## Training data and alignment {#gold source="spacy/training"}
|
## Training data and alignment {#gold source="spacy/training"}
|
||||||
|
|
||||||
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
||||||
|
|
|
@ -805,15 +805,30 @@ def MyModel(output_width: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
|
|
||||||
## Data utilities {#data}
|
## Data utilities {#data}
|
||||||
|
|
||||||
spaCy includes various features and utilities to make it easy to train from your
|
spaCy includes various features and utilities to make it easy to train models
|
||||||
own data. If you have training data in a standard format like `.conll` or
|
using your own data, manage training and evaluation corpora, convert existing
|
||||||
`.conllu`, the easiest way to convert it for use with spaCy is to run
|
annotations and configure data augmentation strategies for more robust models.
|
||||||
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory:
|
|
||||||
|
### Converting existing corpora and annotations {#data-convert}
|
||||||
|
|
||||||
|
If you have training data in a standard format like `.conll` or `.conllu`, the
|
||||||
|
easiest way to convert it for use with spaCy is to run
|
||||||
|
[`spacy convert`](/api/cli#convert) and pass it a file and an output directory.
|
||||||
|
By default, the command will pick the converter based on the file extension.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy convert ./train.gold.conll ./corpus
|
$ python -m spacy convert ./train.gold.conll ./corpus
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> #### 💡 Tip: Converting from Prodigy
|
||||||
|
>
|
||||||
|
> If you're using the [Prodigy](https://prodi.gy) annotation tool to create
|
||||||
|
> training data, you can run the
|
||||||
|
> [`data-to-spacy` command](https://prodi.gy/docs/recipes#data-to-spacy) to
|
||||||
|
> merge and export multiple datasets for use with
|
||||||
|
> [`spacy train`](/api/cli#train). Different types of annotations on the same
|
||||||
|
> text will be combined, giving you one corpus to train multiple components.
|
||||||
|
|
||||||
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
|
<Infobox title="Tip: Manage multi-step workflows with projects" emoji="💡">
|
||||||
|
|
||||||
Training workflows often consist of multiple steps, from preprocessing the data
|
Training workflows often consist of multiple steps, from preprocessing the data
|
||||||
|
@ -823,6 +838,27 @@ data assets, track changes and share your end-to-end processes with your team.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
|
||||||
|
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
|
||||||
|
storage**, especially when packing multiple documents together. You can also
|
||||||
|
create `Doc` objects manually, so you can write your own custom logic to convert
|
||||||
|
and store existing annotations for use in spaCy.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### Training data from Doc objects {highlight="6-9"}
|
||||||
|
import spacy
|
||||||
|
from spacy.tokens import Doc, DocBin
|
||||||
|
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
docbin = DocBin(nlp.vocab)
|
||||||
|
words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
|
||||||
|
spaces = [True, True, True, True, True, True, True, False]
|
||||||
|
ents = [("ORG", 0, 1), ("GPE", 5, 6)]
|
||||||
|
doc = Doc(nlp.vocab, words=words, spaces=spaces, ents=ents)
|
||||||
|
docbin.add(doc)
|
||||||
|
docbin.to_disk("./train.spacy")
|
||||||
|
```
|
||||||
|
|
||||||
### Working with corpora {#data-corpora}
|
### Working with corpora {#data-corpora}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
Loading…
Reference in New Issue