diff --git a/website/api/language.jade b/website/api/language.jade
index 1ea129295..2e78dd633 100644
--- a/website/api/language.jade
+++ b/website/api/language.jade
@@ -157,12 +157,19 @@ p Update the models in the pipeline.
     +row
         +cell #[code docs]
         +cell iterable
-        +cell A batch of #[code Doc] objects.
+        +cell
+            | A batch of #[code Doc] objects or unicode. If unicode, a
+            | #[code Doc] object will be created from the text.
 
     +row
         +cell #[code golds]
         +cell iterable
-        +cell A batch of #[code GoldParse] objects.
+        +cell
+            | A batch of #[code GoldParse] objects or dictionaries.
+            | Dictionaries will be used to create
+            | #[+api("goldparse") #[code GoldParse]] objects. For the available
+            | keys and their usage, see
+            | #[+api("goldparse#init") #[code GoldParse.__init__]].
 
     +row
         +cell #[code drop]
diff --git a/website/usage/_training/_basics.jade b/website/usage/_training/_basics.jade
index d20648416..6ac1477e8 100644
--- a/website/usage/_training/_basics.jade
+++ b/website/usage/_training/_basics.jade
@@ -172,15 +172,23 @@ p
     +row
         +cell #[code get_data]
-        +cell A function converting the training data to spaCy's JSON format.
+        +cell
+            | An optional function converting the training data to spaCy's
+            | JSON format.
 
     +row
         +cell #[code doc]
-        +cell #[+api("doc") #[code Doc]] objects.
+        +cell
+            | #[+api("doc") #[code Doc]] objects. The #[code update] method
+            | takes a sequence of them, so you can batch up your training
+            | examples.
 
     +row
         +cell #[code gold]
-        +cell #[+api("goldparse") #[code GoldParse]] objects.
+        +cell
+            | #[+api("goldparse") #[code GoldParse]] objects. The #[code update]
+            | method takes a sequence of them, so you can batch up your
+            | training examples.
 
     +row
         +cell #[code drop]
@@ -197,3 +205,49 @@ p
     | a model will be saved out to the directory. After training, you can
     | use the #[+api("cli#package") #[code package]] command to generate an
     | installable Python package from your model.
+
++h(3, "training-simple-style") Simple training style
+    +tag-new(2)
+
+p
+    | Instead of sequences of #[code Doc] and #[code GoldParse] objects,
+    | you can also use the "simple training style" and pass
+    | #[strong raw texts] and #[strong dictionaries of annotations]
+    | to #[+api("language#update") #[code nlp.update]].
+    | The dictionaries can have the keys #[code entities], #[code heads],
+    | #[code deps], #[code tags] and #[code cats]. This is generally
+    | recommended, as it removes one layer of abstraction and avoids
+    | unnecessary imports. It also makes it easier to structure and load
+    | your training data.
+
++aside-code("Example Annotations").
+    {
+        'entities': [(0, 4, 'ORG')],
+        'heads': [1, 1, 1, 5, 5, 2, 7, 5],
+        'deps': ['nsubj', 'ROOT', 'prt', 'quantmod', 'compound', 'pobj', 'det', 'npadvmod'],
+        'tags': ['PROPN', 'VERB', 'ADP', 'SYM', 'NUM', 'NUM', 'DET', 'NOUN'],
+        'cats': {'BUSINESS': 1.0}
+    }
+
++code("Simple training loop").
+    import random
+    import spacy
+
+    TRAIN_DATA = [
+        ("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]}),
+        ("Google rebrands its business apps", {'entities': [(0, 6, 'ORG')]})]
+
+    nlp = spacy.blank('en')
+    # a blank model has no entity recognizer, so add one to train
+    ner = nlp.create_pipe('ner')
+    ner.add_label('ORG')
+    nlp.add_pipe(ner)
+
+    optimizer = nlp.begin_training()
+    for i in range(20):
+        random.shuffle(TRAIN_DATA)
+        for text, annotations in TRAIN_DATA:
+            nlp.update([text], [annotations], sgd=optimizer)
+    nlp.to_disk('/model')
+
+p
+    | The above training loop leaves out a few details that can really
+    | improve accuracy – but the principle really is #[em that] simple. Once
+    | you've got your pipeline together and you want to tune the accuracy,
+    | you usually want to process your training examples in batches, and
+    | experiment with #[+api("top-level#util.minibatch") #[code minibatch]]
+    | sizes and dropout rates, set via the #[code drop] keyword argument. See
+    | the #[+api("language") #[code Language]] and #[+api("pipe") #[code Pipe]]
+    | API docs for available options.
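For reference, the two calling conventions documented above end up equivalent: the simple style builds the Doc and GoldParse for you. The sketch below spells out both, assuming spaCy 2.x and a blank pipeline with an entity recognizer added; it's an illustration, not taken from the docs themselves.

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_label('ORG')
    nlp.add_pipe(ner)
    optimizer = nlp.begin_training()

    text = "Uber blew through $1 million a week"
    annotations = {'entities': [(0, 4, 'ORG')]}

    # simple style: raw text plus annotation dict
    nlp.update([text], [annotations], sgd=optimizer)

    # classic style: the same update, with Doc and GoldParse built explicitly
    doc = nlp.make_doc(text)
    gold = GoldParse(doc, entities=annotations['entities'])
    nlp.update([doc], [gold], sgd=optimizer)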
diff --git a/website/usage/_training/_ner.jade b/website/usage/_training/_ner.jade
index c1002ecdf..b8ef43916 100644
--- a/website/usage/_training/_ner.jade
+++ b/website/usage/_training/_ner.jade
@@ -39,12 +39,6 @@ p
 +h(4) Step by step guide
 
 +list("numbers")
-    +item
-        | #[strong Reformat the training data] to match spaCy's
-        | #[+a("/api/annotation#json-input") JSON format]. The built-in
-        | #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]]
-        | function can help you with this.
-
     +item
         | #[strong Load the model] you want to start with, or create an
         | #[strong empty model] using
@@ -56,17 +50,13 @@ p
         | This way, you'll only be training the entity recognizer.
 
     +item
-        | #[strong Shuffle and loop over] the examples and create a
-        | #[code Doc] and #[code GoldParse] object for each example.
-
-    +item
-        | For each example, #[strong update the model]
-        | by calling #[+api("language#update") #[code nlp.update]], which steps
+        | #[strong Shuffle and loop over] the examples. For each example,
+        | #[strong update the model] by calling
+        | #[+api("language#update") #[code nlp.update]], which steps
         | through the words of the input. At each word, it makes a
-        | #[strong prediction]. It then consults the annotations provided on the
-        | #[code GoldParse] instance, to see whether it was
-        | right. If it was wrong, it adjusts its weights so that the correct
-        | action will score higher next time.
+        | #[strong prediction]. It then consults the annotations to see whether
+        | it was right. If it was wrong, it adjusts its weights so that the
+        | correct action will score higher next time.
 
     +item
         | #[strong Save] the trained model using
@@ -90,13 +80,16 @@ p
 
 +github("spacy", "examples/training/train_new_entity_type.py", 500)
 
++aside("Important note", "⚠️")
+    | If you're using an existing model, make sure to mix in examples of
+    | #[strong other entity types] that spaCy correctly recognized before.
+    | Otherwise, your model might learn the new type, but "forget" what it
+    | previously knew. This is also referred to as the
+    | #[+a("https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting", true) "catastrophic forgetting" problem].
+
 +h(4) Step by step guide
 
 +list("numbers")
-    +item
-        | Create #[code Doc] and #[code GoldParse] objects for
-        | #[strong each example in your training data].
-
     +item
         | #[strong Load the model] you want to start with, or create an
         | #[strong empty model] using
@@ -117,10 +110,9 @@ p
         | #[strong Loop over] the examples and call
         | #[+api("language#update") #[code nlp.update]], which steps through
         | the words of the input. At each word, it makes a
-        | #[strong prediction]. It then consults the annotations provided on the
-        | #[code GoldParse] instance, to see whether it was right. If it was
-        | wrong, it adjusts its weights so that the correct action will score
-        | higher next time.
+        | #[strong prediction]. It then consults the annotations to see
+        | whether it was right. If it was wrong, it adjusts its weights so that
+        | the correct action will score higher next time.
 
     +item
         | #[strong Save] the trained model using
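The NER steps above, the catastrophic-forgetting note, and the batching advice from the basics page can be combined in one loop. A rough sketch, loosely following the pattern of spaCy's training examples; the model name, data and hyperparameters are placeholders, and spaCy 2.x is assumed.

    import random
    import spacy
    from spacy.util import minibatch, compounding

    TRAIN_DATA = [
        ("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]}),
        # mix in an entity type the model already gets right
        ("I saw her in London", {'entities': [(13, 19, 'GPE')]})]

    nlp = spacy.load('en_core_web_sm')    # hypothetical starting model
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train the entity recognizer
        optimizer = nlp.begin_training()
        for i in range(10):
            random.shuffle(TRAIN_DATA)
            # batch size grows from 4 to 32 as training progresses
            for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.35, sgd=optimizer)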
diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade
index 646f9ecb0..13fc8e844 100644
--- a/website/usage/_training/_tagger-parser.jade
+++ b/website/usage/_training/_tagger-parser.jade
@@ -30,19 +30,13 @@ p
         | not necessary – but it doesn't hurt either, just to be safe.
 
     +item
-        | #[strong Shuffle and loop over] the examples and create a
-        | #[code Doc] and #[code GoldParse] object for each example. Make sure
-        | to pass in the #[code heads] and #[code deps] when you create the
-        | #[code GoldParse].
-
-    +item
-        | For each example, #[strong update the model]
-        | by calling #[+api("language#update") #[code nlp.update]], which steps
-        | through the words of the input. At each word, it makes a
-        | #[strong prediction]. It then consults the annotations provided on the
-        | #[code GoldParse] instance, to see whether it was
-        | right. If it was wrong, it adjusts its weights so that the correct
-        | action will score higher next time.
+        | #[strong Shuffle and loop over] the examples. For each example,
+        | #[strong update the model] by calling
+        | #[+api("language#update") #[code nlp.update]], which steps through
+        | the words of the input. At each word, it makes a
+        | #[strong prediction]. It then consults the annotations to see
+        | whether it was right. If it was wrong, it adjusts its weights so
+        | that the correct action will score higher next time.
 
     +item
         | #[strong Save] the trained model using
@@ -67,26 +61,29 @@ p
 +list("numbers")
     +item
-        | #[strong Create] a new #[code Language] class and before initialising
-        | it, update the #[code tag_map] in its #[code Defaults] with your
-        | custom tags.
+        | #[strong Load the model] you want to start with, or create an
+        | #[strong empty model] using
+        | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
+        | language. If you're using a blank model, don't forget to add the
+        | tagger to the pipeline. If you're using an existing model,
+        | make sure to disable all other pipeline components during training
+        | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
+        | This way, you'll only be training the tagger.
 
     +item
-        | #[strong Create a new tagger] component and add it to the pipeline.
+        | #[strong Add the tag map] to the tagger using the
+        | #[+api("tagger#add_label") #[code add_label]] method. The first
+        | argument is the new tag name, the second the mapping to spaCy's
+        | coarse-grained tags, e.g. #[code {'pos': 'NOUN'}].
 
     +item
-        | #[strong Shuffle and loop over] the examples and create a
-        | #[code Doc] and #[code GoldParse] object for each example. Make sure
-        | to pass in the #[code tags] when you create the #[code GoldParse].
-
-    +item
-        | For each example, #[strong update the model]
-        | by calling #[+api("language#update") #[code nlp.update]], which steps
-        | through the words of the input. At each word, it makes a
-        | #[strong prediction]. It then consults the annotations provided on the
-        | #[code GoldParse] instance, to see whether it was
-        | right. If it was wrong, it adjusts its weights so that the correct
-        | action will score higher next time.
+        | #[strong Shuffle and loop over] the examples. For each example,
+        | #[strong update the model] by calling
+        | #[+api("language#update") #[code nlp.update]], which steps through
+        | the words of the input. At each word, it makes a
+        | #[strong prediction]. It then consults the annotations to see whether
+        | it was right. If it was wrong, it adjusts its weights so that the
+        | correct action will score higher next time.
 
     +item
         | #[strong Save] the trained model using
@@ -124,7 +121,7 @@ p
     | respective action – e.g. search the database for hotels with high ratings
     | for their wifi offerings.
 
-+aside("Tip: merge phrases and entities")
++aside("Tip: merge phrases and entities", "💡")
     | To achieve even better accuracy, try merging multi-word tokens and
     | entities specific to your domain into one token before parsing your text.
     | You can do this by running the entity recognizer or
@@ -160,9 +157,10 @@ p
         | #[strong empty model] using
         | #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
         | language. If you're using a blank model, don't forget to add the
-        | parser to the pipeline. If you're using an existing model,
-        | make sure to disable all other pipeline components during training
-        | using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
+        | custom parser to the pipeline. If you're using an existing model,
+        | make sure to #[strong remove the old parser] from the pipeline, and
+        | disable all other pipeline components during training using
+        | #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
         | This way, you'll only be training the parser.
 
     +item
         | #[+api("dependencyparser#add_label") #[code add_label]] method.
 
@@ -170,19 +168,13 @@ p
     +item
-        | #[strong Shuffle and loop over] the examples and create a
-        | #[code Doc] and #[code GoldParse] object for each example. Make sure
-        | to pass in the #[code heads] and #[code deps] when you create the
-        | #[code GoldParse].
-
-    +item
-        | For each example, #[strong update the model]
-        | by calling #[+api("language#update") #[code nlp.update]], which steps
-        | through the words of the input. At each word, it makes a
-        | #[strong prediction]. It then consults the annotations provided on the
-        | #[code GoldParse] instance, to see whether it was
-        | right. If it was wrong, it adjusts its weights so that the correct
-        | action will score higher next time.
+        | #[strong Shuffle and loop over] the examples. For each example,
+        | #[strong update the model] by calling
+        | #[+api("language#update") #[code nlp.update]], which steps
+        | through the words of the input. At each word, it makes a
+        | #[strong prediction]. It then consults the annotations to see whether
+        | it was right. If it was wrong, it adjusts its weights so that the
+        | correct action will score higher next time.
 
     +item
         | #[strong Save] the trained model using
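The new tag map step above is easiest to see in code. A sketch assuming spaCy 2.x, with invented tag names, mappings and training data; the custom-parser guide follows the same shape, with add_label called for each dependency label instead.

    import random
    import spacy

    # invented fine-grained tags, each mapped to a coarse-grained POS
    TAG_MAP = {'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}, 'J': {'pos': 'ADJ'}}
    TRAIN_DATA = [
        ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
        ("Eat blue ham", {'tags': ['V', 'J', 'N']})]

    nlp = spacy.blank('en')
    tagger = nlp.create_pipe('tagger')
    for tag, values in TAG_MAP.items():
        # first argument: the new tag name; second: mapping to coarse tags
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(25):
        random.shuffle(TRAIN_DATA)
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer)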
diff --git a/website/usage/_training/_textcat.jade b/website/usage/_training/_textcat.jade
index b7b47c3ba..7c53966be 100644
--- a/website/usage/_training/_textcat.jade
+++ b/website/usage/_training/_textcat.jade
@@ -35,17 +35,18 @@ p
         | be able to see results on each training iteration.
 
     +item
-        | #[strong Loop over] the training examples, partition them into
-        | batches and create #[code Doc] and #[code GoldParse] objects for each
-        | example in the batch.
+        | #[strong Loop over] the training examples and partition them into
+        | batches using spaCy's
+        | #[+api("top-level#util.minibatch") #[code minibatch]] and
+        | #[+api("top-level#util.compounding") #[code compounding]] helpers.
 
     +item
         | #[strong Update the model] by calling
         | #[+api("language#update") #[code nlp.update]], which steps
         | through the examples and makes a #[strong prediction]. It then
-        | consults the annotations provided on the #[code GoldParse] instance,
-        | to see whether it was right. If it was wrong, it adjusts its weights
-        | so that the correct prediction will score higher next time.
+        | consults the annotations to see whether it was right. If it was
+        | wrong, it adjusts its weights so that the correct prediction will
+        | score higher next time.
 
     +item
         | Optionally, you can also #[strong evaluate the text classifier] on
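A sketch of the text classifier loop just described, using the minibatch and compounding helpers named in the steps. Assumes spaCy 2.x; the label and examples are made up.

    import random
    import spacy
    from spacy.util import minibatch, compounding

    TRAIN_DATA = [
        ("This movie was great", {'cats': {'POSITIVE': 1.0}}),
        ("Worst film I have ever seen", {'cats': {'POSITIVE': 0.0}})]

    nlp = spacy.blank('en')
    textcat = nlp.create_pipe('textcat')
    textcat.add_label('POSITIVE')
    nlp.add_pipe(textcat, last=True)

    optimizer = nlp.begin_training()
    for i in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for batch in minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, sgd=optimizer,
                       losses=losses)
        print(i, losses)   # watch the loss on each iteration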
diff --git a/website/usage/_v2/_migrating.jade b/website/usage/_v2/_migrating.jade
index 549bd628b..e2211f86f 100644
--- a/website/usage/_v2/_migrating.jade
+++ b/website/usage/_v2/_migrating.jade
@@ -110,17 +110,23 @@ p
     | spaCy when to #[em stop], you can now explicitly call
-    | #[+api("language#begin_training") #[code begin_taining]], which
+    | #[+api("language#begin_training") #[code begin_training]], which
     | returns an optimizer you can pass into the
-    | #[+api("language#update") #[code update]] function.
+    | #[+api("language#update") #[code update]] function. While #[code update]
+    | still accepts sequences of #[code Doc] and #[code GoldParse] objects,
+    | you can now also pass in a list of strings and dictionaries describing
+    | the annotations. This is the recommended usage, as it removes one layer
+    | of abstraction from the training.
 
 +code-new.
     optimizer = nlp.begin_training()
     for itn in range(1000):
-        for doc, gold in train_data:
-            nlp.update([doc], [gold], sgd=optimizer)
+        for text, annotations in train_data:
+            nlp.update([text], [annotations], sgd=optimizer)
     nlp.to_disk('/model')
 
 +code-old.
     for itn in range(1000):
-        for doc, gold in train_data:
+        for text, entities in train_data:
+            doc = nlp.make_doc(text)
+            gold = GoldParse(doc, entities=entities)
             nlp.update(doc, gold)
     nlp.end_training()
     nlp.save_to_directory('/model')
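To round off the migration note, here's a self-contained version of the new-style snippet above, under the same assumptions as the earlier examples: spaCy 2.x, invented train_data, and an entity recognizer added to a blank pipeline.

    import random
    import spacy

    train_data = [
        ("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]})]

    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_label('ORG')
    nlp.add_pipe(ner)

    optimizer = nlp.begin_training()
    for itn in range(100):
        random.shuffle(train_data)
        # update() takes lists, so each text and annotation dict is wrapped
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)
    nlp.to_disk('/model')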