From 2298e129e68fd65b0dc928f747d0bcb1bac645b0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 7 Jul 2020 20:30:12 +0200 Subject: [PATCH] Update example and training docs --- website/docs/api/example.md | 2 + website/docs/images/training-loop.svg | 2 +- website/docs/usage/training.md | 92 ++++++++++++++++++--------- 3 files changed, 66 insertions(+), 30 deletions(-) diff --git a/website/docs/api/example.md b/website/docs/api/example.md index ca1b762c1..421828f95 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -23,6 +23,7 @@ both documents. > ```python > from spacy.tokens import Doc > from spacy.gold import Example +> > words = ["hello", "world", "!"] > spaces = [True, False, False] > predicted = Doc(nlp.vocab, words=words, spaces=spaces) @@ -50,6 +51,7 @@ annotations provided as a dictionary. > ```python > from spacy.tokens import Doc > from spacy.gold import Example +> > predicted = Doc(vocab, words=["Apply", "some", "sunscreen"]) > token_ref = ["Apply", "some", "sun", "screen"] > tags_ref = ["VERB", "DET", "NOUN", "NOUN"] diff --git a/website/docs/images/training-loop.svg b/website/docs/images/training-loop.svg index e883b36be..144fe2d3d 100644 --- a/website/docs/images/training-loop.svg +++ b/website/docs/images/training-loop.svg @@ -26,7 +26,7 @@ - GoldParse + Example diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 51282c2ab..597ade4e6 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -375,6 +375,18 @@ mattis pretium. ## Internal training API {#api} + + +spaCy gives you full control over the training loop. However, for most use +cases, it's recommended to train your models via the +[`spacy train`](/api/cli#train) command with a [`config.cfg`](#config) to keep +track of your settings and hyperparameters, instead of writing your own training +scripts from scratch. 
+ + + + + The [`Example`](/api/example) object contains annotated training data, also called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object that will hold the predictions, and another `Doc` object that holds the @@ -393,42 +405,52 @@ example = Example(predicted, reference) Alternatively, the `reference` `Doc` with the gold-standard annotations can be created from a dictionary with keyword arguments specifying the annotations, -like `tags` or `entities`: +like `tags` or `entities`. Using the `Example` object and its gold-standard +annotations, the model can be updated to learn a sentence of three words with +their assigned part-of-speech tags. + +> #### About the tag map +> +> The tag map is part of the vocabulary and defines the annotation scheme. If +> you're training a new language model, this will let you map the tags present +> in the treebank you train on to spaCy's tag scheme: +> +> ```python +> tag_map = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}} +> vocab = Vocab(tag_map=tag_map) +> ``` ```python words = ["I", "like", "stuff"] tags = ["NOUN", "VERB", "NOUN"] -predicted = Doc(en_vocab, words=words) +predicted = Doc(nlp.vocab, words=words) example = Example.from_dict(predicted, {"tags": tags}) ``` -Using the `Example` object and its gold-standard annotations, the model can be -updated to learn a sentence of three words with their assigned part-of-speech -tags. - - - -The [tag map](/usage/adding-languages#tag-map) is part of the vocabulary and -defines the annotation scheme. 
If you're training a new language model, this -will let you map the tags present in the treebank you train on to spaCy's tag -scheme: - -```python -vocab = Vocab(tag_map={"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}}) -``` - -Another example shows how to define gold-standard named entities: - -```python -doc = Doc(vocab, words=["Facebook", "released", "React", "in", "2014"]) -example = Example.from_dict(doc, {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]}) -``` - +Here's another example that shows how to define gold-standard named entities. The letters added before the labels refer to the tags of the [BILUO scheme](/usage/linguistic-features#updating-biluo) – `O` is a token outside an entity, `U` a single entity unit, `B` the beginning of an entity, `I` a token inside an entity and `L` the last token of an entity. +```python +doc = Doc(nlp.vocab, words=["Facebook", "released", "React", "in", "2014"]) +example = Example.from_dict(doc, {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]}) +``` + + + +As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class. +It can be constructed in a very similar way, from a `Doc` and a dictionary of +annotations: + +```diff +- gold = GoldParse(doc, entities=entities) ++ example = Example.from_dict(doc, {"entities": entities}) +``` + + + > - **Training data**: The training examples. > - **Text and label**: The current example. > - **Doc**: A `Doc` object created from the example text. @@ -479,9 +501,21 @@ The [`nlp.update`](/api/language#update) method takes the following arguments: | `drop` | Dropout rate. Makes it harder for the model to just memorize the data. | | `sgd` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object, which updates the model's weights. If not set, spaCy will create a new one and save it for further use. 
| - + -Instead of writing your own training loop, you can also use the built-in -[`train`](/api/cli#train) command, which expects data in spaCy's -[JSON format](/api/data-formats#json-input). On each epoch, a model will be -saved out to the directory. +As of v3.0, the [`Example`](/api/example) object replaces the `GoldParse` class +and the "simple training style" of calling `nlp.update` with a text and a +dictionary of annotations. Updating your code to use the `Example` object should +be very straightforward: you can call +[`Example.from_dict`](/api/example#from_dict) with a [`Doc`](/api/doc) and the +dictionary of annotations: + +```diff +text = "Facebook released React in 2014" +annotations = {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]} ++ example = Example.from_dict(nlp.make_doc(text), annotations) +- nlp.update([text], [annotations]) ++ nlp.update([example]) +``` + +