From bea6e6bfad846d21941aa3638f92f5f1e2fc51e7 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:51:14 +0200 Subject: [PATCH 01/10] Allow annotation row to take children --- website/_includes/_mixins.jade | 1 + 1 file changed, 1 insertion(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 05e64b0fa..ce8bfad4e 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -382,3 +382,4 @@ mixin annotation-row(annots, style) +cell #[code=cell] else +cell=cell + block From 9c975c488250a64670fe705e313021081e052515 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:51:22 +0200 Subject: [PATCH 02/10] Add training illustrations --- website/assets/img/docs/training-loop.svg | 40 +++++++++++++++++++ website/assets/img/docs/training.svg | 47 +++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 website/assets/img/docs/training-loop.svg create mode 100644 website/assets/img/docs/training.svg diff --git a/website/assets/img/docs/training-loop.svg b/website/assets/img/docs/training-loop.svg new file mode 100644 index 000000000..c0acd10cf --- /dev/null +++ b/website/assets/img/docs/training-loop.svg @@ -0,0 +1,40 @@ + + + + + + + + Training data + + + + label + + + + text + + + + + + Doc + + + + GoldParse + + + + update + + nlp + + + + optimizer + diff --git a/website/assets/img/docs/training.svg b/website/assets/img/docs/training.svg new file mode 100644 index 000000000..cd6b74f04 --- /dev/null +++ b/website/assets/img/docs/training.svg @@ -0,0 +1,47 @@ + + + + + + + + + + + + + PREDICT + + + + SAVE + + Model + + + + + + Training data + + + + label + + + + label + + Updated + Model + + + text + + + + GRADIENT + From 77dca25c7f18fe55c969914aeb9d0576d2df868b Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:51:31 +0200 Subject: [PATCH 03/10] Update Language API docs --- website/docs/api/language.jade | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 
deletions(-) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 9e45a89d9..9c26f506c 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -141,10 +141,10 @@ p p Update the models in the pipeline. +aside-code("Example"). - with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): - for epoch in trainer.epochs(gold): - for docs, golds in epoch: - state = nlp.update(docs, golds, sgd=optimizer) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + nlp.update([doc], [gold], drop=0.5, sgd=optimizer) +table(["Name", "Type", "Description"]) +row @@ -173,17 +173,13 @@ p Update the models in the pipeline. +cell Results from the update. +h(2, "begin_training") Language.begin_training - +tag contextmanager + +tag method p - | Allocate models, pre-process training data and acquire a trainer and - | optimizer. Used as a contextmanager. + | Allocate models, pre-process training data and acquire an optimizer. +aside-code("Example"). - with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): - for epoch in trainer.epochs(gold): - for docs, golds in epoch: - state = nlp.update(docs, golds, sgd=optimizer) + optimizer = nlp.begin_training(gold_tuples) +table(["Name", "Type", "Description"]) +row @@ -199,7 +195,7 @@ p +footrow +cell yields +cell tuple - +cell A trainer and an optimizer. + +cell An optimizer. 
+h(2, "use_params") Language.use_params +tag contextmanager From d5c8d2f5fd4177b6f4980689ae972352563c28e5 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:52:24 +0200 Subject: [PATCH 04/10] Update about.py and increment version --- spacy/about.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 38e934374..aa42ae05d 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,11 +3,11 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.2' +__version__ = '2.0.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' -__author__ = 'Matthew Honnibal' -__email__ = 'matt@explosion.ai' +__author__ = 'Explosion AI' +__email__ = 'contact@explosion.ai' __license__ = 'MIT' __docs_models__ = 'https://spacy.io/docs/usage/models' From 72380c952a8d26ede5cfc8726f3347d0e9f22a48 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:52:49 +0200 Subject: [PATCH 05/10] Update training section in NER guide and add links --- website/docs/usage/entity-recognition.jade | 41 ++++++++-------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index f33ef70df..7fd0a6d37 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -154,40 +154,29 @@ p | To provide training examples to the entity recogniser, you'll first need | to create an instance of the #[+api("goldparse") #[code GoldParse]] class. | You can specify your annotations in a stand-off format or as token tags. - -+code. 
- import random - import spacy - from spacy.gold import GoldParse - from spacy.pipeline import EntityRecognizer - - train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]), - ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])] - - nlp = spacy.load('en', entity=False, parser=False) - ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC']) - - for itn in range(5): - random.shuffle(train_data) - for raw_text, entity_offsets in train_data: - doc = nlp.make_doc(raw_text) - gold = GoldParse(doc, entities=entity_offsets) - - nlp.tagger(doc) - ner.update(doc, gold) - -p | If a character offset in your entity annotations don't fall on a token | boundary, the #[code GoldParse] class will treat that annotation as a | missing value. This allows for more realistic training, because the | entity recogniser is allowed to learn from examples that may feature | tokenizer errors. -+aside-code("Example"). ++code. + train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])] + ++code. doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets']) gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O']) - ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL']) - ner.update(doc, gold) + ++infobox + | For more details on #[strong training and updating] the named entity + | recognizer, see the usage guides on #[+a("/docs/usage/training") training] + | and #[+a("/docs/usage/training-ner") training the named entity recognizer], + | or check out the runnable + | #[+src(gh("spaCy", "examples/training/train_ner.py")) training script] + | on GitHub. 
+ ++h(3, "updating-biluo") The BILUO Scheme p | You can also provide token-level entity annotation, using the From abed463bbb19341f13511352398f5fcba86d5d1d Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:52:58 +0200 Subject: [PATCH 06/10] Update serialization 101 --- .../docs/usage/_spacy-101/_serialization.jade | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade index 5620a6151..27804344e 100644 --- a/website/docs/usage/_spacy-101/_serialization.jade +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -1,12 +1,12 @@ //- 💫 DOCS > USAGE > SPACY 101 > SERIALIZATION p - | If you've been modifying the pipeline, vocabulary vectors and entities, or made - | updates to the model, you'll eventually want - | to #[strong save your progress] – for example, everything that's in your #[code nlp] - | object. This means you'll have to translate its contents and structure - | into a format that can be saved, like a file or a byte string. This - | process is called serialization. spaCy comes with + | If you've been modifying the pipeline, vocabulary, vectors and entities, + | or made updates to the model, you'll eventually want to + | #[strong save your progress] – for example, everything that's in your + | #[code nlp] object. This means you'll have to translate its contents and + | structure into a format that can be saved, like a file or a byte string. + | This process is called serialization. spaCy comes with | #[strong built-in serialization methods] and supports the | #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol]. @@ -45,11 +45,7 @@ p | #[code Vocab] holds the context-independent information about the words, | tags and labels, and their #[strong hash values]. 
If the #[code Vocab] | wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve - | those IDs – for example, the word text or the dependency labels. You - | might be saving #[code 446] for "whale", but in a different vocabulary, - | this ID could map to "VERB". Similarly, if your document was processed by - | a German model, its vocab will include the specific - | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels]. + | those IDs back to strings. +code. moby_dick = open('moby_dick.txt', 'r') # open a large document From 2f40d6e7e762e4ddb2d203001b14a977be140d52 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:53:16 +0200 Subject: [PATCH 07/10] Add training 101 --- website/docs/usage/_spacy-101/_training.jade | 51 +++++++++++++++++++- website/docs/usage/spacy-101.jade | 6 +++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade index f4a0c7194..9b283c0eb 100644 --- a/website/docs/usage/_spacy-101/_training.jade +++ b/website/docs/usage/_spacy-101/_training.jade @@ -1,3 +1,52 @@ //- 💫 DOCS > USAGE > SPACY 101 > TRAINING -+under-construction +p + | spaCy's models are #[strong statistical] and every "decision" they make – + | for example, which part-of-speech tag to assign, or whether a word is a + | named entity – is a #[strong prediction]. This prediction is based + | on the examples the model has seen during #[strong training]. To train + | a model, you first need training data – examples of text, and the + | labels you want the model to predict. This could be a part-of-speech tag, + | a named entity or any other information. + +p + | The model is then shown the unlabelled text and will make a prediction. 
+ | Because we know the correct answer, we can give the model feedback on its + | prediction in the form of an #[strong error gradient] of the + | #[strong loss function] that calculates the difference between the training + | example and the expected output. The greater the difference, the more + | significant the gradient and the updates to our model. + ++aside + | #[strong Training data:] Examples and their annotations.#[br] + | #[strong Text:] The input text the model should predict a label for.#[br] + | #[strong Label:] The label the model should predict.#[br] + | #[strong Gradient:] Gradient of the loss function calculating the + | difference between input and expected output. + ++image + include ../../../assets/img/docs/training.svg + .u-text-right + +button("/assets/img/docs/training.svg", false, "secondary").u-text-tag View large graphic + +p + | When training a model, we don't just want it to memorise our examples – + | we want it to come up with a theory that can be + | #[strong generalised across other examples]. After all, we don't just want + | the model to learn that this one instance of "Amazon" right here is a + | company – we want it to learn that "Amazon", in contexts #[em like this], + | is most likely a company. That's why the training data should always be + | representative of the data we want to process. A model trained on + | Wikipedia, where sentences in the first person are extremely rare, will + | likely perform badly on Twitter. Similarly, a model trained on romantic + | novels will likely perform badly on legal text. + +p + | This also means that in order to know how the model is performing, + | and whether it's learning the right things, you don't only need + | #[strong training data] – you'll also need #[strong evaluation data]. If + | you only test the model with the data it was trained on, you'll have no + | idea how well it's generalising.
If you want to train a model from scratch, + | you usually need at least a few hundred examples for both training and + | evaluation. To update an existing model, you can already achieve decent + | results with very few examples – as long as they're representative. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index e1300b5b0..55e7a030a 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -252,6 +252,12 @@ include _spacy-101/_serialization include _spacy-101/_training ++infobox + | To learn more about #[strong training and updating] models, how to create + | training data and how to improve spaCy's named entity recognition models, + | see the usage guides on #[+a("/docs/usage/training") training] and + | #[+a("/docs/usage/training-ner") training the named entity recognizer]. + +h(2, "architecture") Architecture +under-construction From 789e69b73f9ac96c498a49e87037c03cdb86e403 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:53:23 +0200 Subject: [PATCH 08/10] Update training guide --- website/docs/usage/training.jade | 211 ++++++++++++++++++++++++------- 1 file changed, 168 insertions(+), 43 deletions(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index cff51d250..c1a7c1835 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -10,68 +10,193 @@ p include _spacy-101/_training -+h(2, "train-pos-tagger") Training the part-of-speech tagger ++h(3, "training-data") How do I get training data? + +p + | Collecting training data may sound incredibly painful – and it can be, + | if you're planning a large-scale annotation project. However, if your main + | goal is to update an existing model's predictions – for example, spaCy's + | named entity recognition – the hard part is usually not creating the + | actual annotations. It's finding representative examples and + | #[strong extracting potential candidates].
The good news is, if you've + | been noticing bad performance on your data, you likely + | already have some relevant text, and you can use spaCy to + | #[strong bootstrap a first set of training examples]. For example, + | after processing a few sentences, you may end up with the following + | entities, some correct, some incorrect. + ++aside("How many examples do I need?") + | As a rule of thumb, you should allocate at least 10% of your project + | resources to creating training and evaluation data. If you're looking to + | improve an existing model, you might be able to start off with only a + | handful of examples. Keep in mind that you'll always want a lot more than + | that for #[strong evaluation] – especially previous errors the model has + | made. Otherwise, you won't be able to sufficiently verify that the model + | has actually made the #[strong correct generalisations] required for your + | use case. + ++table(["Text", "Entity", "Start", "End", "Label", ""]) + - var style = [0, 0, 1, 1, 1] + +annotation-row(["Uber blew through $1 million a week", "Uber", 0, 4, "ORG"], style) + +cell #[+procon("pro")] + +annotation-row(["Android Pay expands to Canada", "Android", 0, 7, "PERSON"], style) + +cell #[+procon("con")] + +annotation-row(["Android Pay expands to Canada", "Canada", 23, 29, "GPE"], style) + +cell #[+procon("pro")] + +annotation-row(["Spotify steps up Asia expansion", "Spotify", 0, 7, "ORG"], style) + +cell #[+procon("pro")] + +annotation-row(["Spotify steps up Asia expansion", "Asia", 17, 21, "NORP"], style) + +cell #[+procon("con")] + +p + | Alternatively, the + | #[+a("/docs/usage/rule-based-matching#example3") rule-based matcher] + | can be a useful tool to extract tokens or combinations of tokens, as + | well as their start and end index in a document. In this case, we'll + | extract mentions of Google and assume they're an #[code ORG].
+ ++table(["Text", "Entity", "Start", "End", "Label", ""]) + - var style = [0, 0, 1, 1, 1] + +annotation-row(["let me google this for you", "google", 7, 13, "ORG"], style) + +cell #[+procon("con")] + +annotation-row(["Google Maps launches location sharing", "Google", 0, 6, "ORG"], style) + +cell #[+procon("con")] + +annotation-row(["Google rebrands its business apps", "Google", 0, 6, "ORG"], style) + +cell #[+procon("pro")] + +annotation-row(["look what i found on google! 😂", "google", 21, 27, "ORG"], style) + +cell #[+procon("con")] + +p + | Based on the few examples above, you can already create six training + | sentences with eight entities in total. Of course, what you consider a + | "correct annotation" will always depend on + | #[strong what you want the model to learn]. While there are some entity + | annotations that are more or less universally correct – like Canada being + | a geopolitical entity – your application may have its very own definition + | of the #[+a("/docs/api/annotation#named-entities") NER annotation scheme]. +code. - from spacy.vocab import Vocab - from spacy.tagger import Tagger - from spacy.tokens import Doc - from spacy.gold import GoldParse + train_data = [ + ("Uber blew through $1 million a week", [(0, 4, 'ORG')]), + ("Android Pay expands to Canada", [(0, 11, 'PRODUCT'), (23, 29, 'GPE')]), + ("Spotify steps up Asia expansion", [(0, 7, "ORG"), (17, 21, "LOC")]), + ("Google Maps launches location sharing", [(0, 11, "PRODUCT")]), + ("Google rebrands its business apps", [(0, 6, "ORG")]), + ("look what i found on google! 😂", [(21, 27, "PRODUCT")])] ++h(2) Training with annotations +p + | The #[+api("goldparse") #[code GoldParse]] object collects the annotated + | training examples, also called the #[strong gold standard]. It's + | initialised with the #[+api("doc") #[code Doc]] object it refers to, + | and keyword arguments specifying the annotations, like #[code tags] + | or #[code entities].
Its job is to encode the annotations, keep them + | aligned and create the C-level data structures required for efficient access. + | Here's an example of a simple #[code GoldParse] for part-of-speech tags: + ++code. vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}}) - tagger = Tagger(vocab) - doc = Doc(vocab, words=['I', 'like', 'stuff']) gold = GoldParse(doc, tags=['N', 'V', 'N']) - tagger.update(doc, gold) p - +button(gh("spaCy", "examples/training/train_tagger.py"), false, "secondary") Full example - -+h(2, "train-entity") Training the named entity recognizer + | Using the #[code Doc] and its gold-standard annotations, the model can be + | updated to learn a sentence of three words with their assigned + | part-of-speech tags. The #[+a("/docs/usage/adding-languages#tag-map") tag map] + | is part of the vocabulary and defines the annotation scheme. If you're + | training a new language model, this will let you map the tags present in + | the treebank you train on to spaCy's tag scheme. +code. - from spacy.vocab import Vocab - from spacy.pipeline import EntityRecognizer - from spacy.tokens import Doc - - vocab = Vocab() - entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC']) - - doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + doc = Doc(Vocab(), words=['Facebook', 'released', 'React', 'in', '2014']) + gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE']) p - +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example + | The same goes for named entities. The letters added before the labels + | refer to the tags of the + | #[+a("/docs/usage/entity-recognition#updating-biluo") BILUO scheme] – + | #[code O] is a token outside an entity, #[code U] a single entity unit, + | #[code B] the beginning of an entity, #[code I] a token inside an entity + | and #[code L] the last token of an entity.
-+h(2, "extend-entity") Extending the named entity recognizer ++aside + | #[strong Training data]: The training examples.#[br] + | #[strong Text and label]: The current example.#[br] + | #[strong Doc]: A #[code Doc] object created from the example text.#[br] + | #[strong GoldParse]: A #[code GoldParse] object of the #[code Doc] and label.#[br] + | #[strong nlp]: The #[code nlp] object with the model.#[br] + | #[strong Optimizer]: A function that holds state between updates.#[br] + | #[strong Update]: Update the model's weights.#[br] + | #[strong ] + ++image + include ../../assets/img/docs/training-loop.svg + .u-text-right + +button("/assets/img/docs/training-loop.svg", false, "secondary").u-text-tag View large graphic p - | All #[+a("/docs/usage/models") spaCy models] support online learning, so - | you can update a pre-trained model with new examples. You can even add - | new classes to an existing model, to recognise a new entity type, - | part-of-speech, or syntactic relation. Updating an existing model is - | particularly useful as a "quick and dirty solution", if you have only a - | few corrections or annotations. + | Of course, it's not enough to only show a model a single example once. + | Especially if you only have few examples, you'll want to train for a + | #[strong number of iterations]. At each iteration, the training data is + | #[strong shuffled] to ensure the model doesn't make any generalisations + | based on the order of examples. Another technique to improve the learning + | results is to set a #[strong dropout rate], a rate at which to randomly + | "drop" individual features and representations. This makes it harder for + | the model to memorise the training data. For example, a #[code 0.25] + | dropout means that each feature or internal representation has a 1/4 + | likelihood of being dropped. 
-p.o-inline-list - +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example - +button("/docs/usage/training-ner", false, "secondary") Usage guide ++aside + | #[+api("language#begin_training") #[code begin_training()]]: Start the + | training and return an optimizer function to update the model's weights.#[br] + | #[+api("language#update") #[code update()]]: Update the model with the + | training example and gold data.#[br] + | #[+api("language#to_disk") #[code to_disk()]]: Save the updated model to + | a directory. -+h(2, "train-dependency") Training the dependency parser ++code("Example training loop"). + optimizer = nlp.begin_training(get_data) + for itn in range(100): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + nlp.update([doc], [gold], drop=0.5, sgd=optimizer) + nlp.to_disk('/model') -+code. - from spacy.vocab import Vocab - from spacy.pipeline import DependencyParser - from spacy.tokens import Doc ++table(["Name", "Description"]) + +row + +cell #[code train_data] + +cell The training data. - vocab = Vocab() - parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct']) + +row + +cell #[code get_data] + +cell A function converting the training data to spaCy's JSON format. - doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) - parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'), - (1, 'punct')]) + +row + +cell #[code doc] + +cell #[+api("doc") #[code Doc]] objects. -p - +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example + +row + +cell #[code gold] + +cell #[+api("goldparse") #[code GoldParse]] objects. + + +row + +cell #[code drop] + +cell Dropout rate. Makes it harder for the model to just memorise the data. + + +row + +cell #[code optimizer] + +cell Callable to update the model's weights. 
+ ++infobox + | For the #[strong full example and more details], see the usage guide on + | #[+a("/docs/usage/training-ner") training the named entity recognizer], + | or the runnable + | #[+src(gh("spaCy", "examples/training/train_ner.py")) training script] + | on GitHub. + ++h(2) Examples + ++under-construction From 03bbb96db8ec82f3a4a72d1ba66a44320d9b5d1c Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:56:02 +0200 Subject: [PATCH 09/10] Remove outdated examples --- website/docs/usage/training-ner.jade | 107 +-------------------------- 1 file changed, 3 insertions(+), 104 deletions(-) diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 5a0c06462..500bb24ff 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -8,6 +8,8 @@ p | particularly useful as a "quick and dirty solution", if you have only a | few corrections or annotations. ++under-construction + +h(2, "improving-accuracy") Improving accuracy on existing entity types p @@ -15,16 +17,7 @@ p | #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels | you want to learn. You will then pass this instance to the | #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]] - | method. For example: - -+code. - import spacy - from spacy.gold import GoldParse - - nlp = spacy.load('en') - doc = nlp.make_doc(u'Facebook released React in 2014') - gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE']) - nlp.entity.update(doc, gold) + | method. p | You'll usually need to provide many examples to meaningfully improve the @@ -44,100 +37,6 @@ p | #[strong experiment on your own data] to find a solution that works best | for you. -+h(2, "adding") Adding a new entity type - -p - | You can add new entity types to an existing model. Let's say we want to - | recognise the category #[code TECHNOLOGY]. The new category will include - | programming languages, frameworks and platforms. 
First, we need to - | register the new entity type: - -+code. - nlp.entity.add_label('TECHNOLOGY') - -p - | Next, iterate over your examples, calling #[code entity.update()]. As - | above, we want to avoid iterating over only a small number of sentences. - | A useful compromise is to run the model over a number of plain-text - | sentences, and pass the entities to #[code GoldParse], as "true" - | annotations. This encourages the optimizer to find a solution that - | predicts the new category with minimal difference from the previous - | output. - -+h(2, "example") Example: Adding and training an #[code ANIMAL] entity - -+under-construction - -p - | This script shows how to add a new entity type to an existing pre-trained - | NER model. To keep the example short and simple, only four sentences are - | provided as examples. In practice, you'll need many more — - | #[strong a few hundred] would be a good start. You will also likely need - | to mix in #[strong examples of other entity types], which might be - | obtained by running the entity recognizer over unlabelled sentences, and - | adding their annotations to the training set. - -p - | For the full, runnable script of this example, see - | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py]. - -+code("Training the entity recognizer"). 
- import spacy - from spacy.pipeline import EntityRecognizer - from spacy.gold import GoldParse - from spacy.tagger import Tagger - import random - - model_name = 'en' - entity_label = 'ANIMAL' - output_directory = '/path/to/model' - train_data = [ - ("Horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')]), - ("horses are too tall and they pretend to care about your feelings", - [(0, 6, 'ANIMAL')]), - ("horses pretend to care about your feelings", - [(0, 6, 'ANIMAL')]), - ("they pretend to care about your feelings, those horses", - [(48, 54, 'ANIMAL')]) - ] - - nlp = spacy.load(model_name) - nlp.entity.add_label(entity_label) - ner = train_ner(nlp, train_data, output_directory) - - def train_ner(nlp, train_data, output_dir): - # Add new words to vocab - for raw_text, _ in train_data: - doc = nlp.make_doc(raw_text) - for word in doc: - _ = nlp.vocab[word.orth] - - for itn in range(20): - random.shuffle(train_data) - for raw_text, entity_offsets in train_data: - gold = GoldParse(doc, entities=entity_offsets) - doc = nlp.make_doc(raw_text) - nlp.tagger(doc) - loss = nlp.entity.update(doc, gold) - nlp.save_to_directory(output_dir) - -p - +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example - -p - | The actual training is performed by looping over the examples, and - | calling #[code nlp.entity.update()]. The #[code update()] method steps - | through the words of the input. At each word, it makes a prediction. It - | then consults the annotations provided on the #[code GoldParse] instance, - | to see whether it was right. If it was wrong, it adjusts its weights so - | that the correct action will score higher next time. - -p - | After training your model, you can - | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend - | wrapping models as Python packages, for ease of deployment. 
- +h(2, "saving-loading") Saving and loading p From 4a927154d83c8f618fe0a991cef87ef2b1caf1a0 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 11:56:32 +0200 Subject: [PATCH 10/10] Update v2 docs --- website/docs/usage/v2.jade | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 75c8c2d3c..0d57a17b4 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -170,7 +170,7 @@ p python -m spacy download de # default German model python -m spacy download fr # default French model python -m spacy download es # default Spanish model - python -m spacy download xx_ent_web_md # multi-language NER + python -m spacy download xx_ent_wiki_sm # multi-language NER p | spaCy v2.0 comes with new and improved neural network models for English, @@ -294,9 +294,6 @@ p +h(2, "migrating") Migrating from spaCy 1.x p - | If you've mostly been using spaCy for basic text processing, chances are - | you won't even have to change your code at all. For all other cases, - | we've tried to focus... +infobox("Some tips") | Before migrating, we strongly recommend writing a few @@ -339,6 +336,11 @@ p nlp.save_to_directory('/model') nlp.vocab.dump('/vocab') +p + | If you've trained models with input from v1.x, you'll need to + | #[strong retrain them] with spaCy v2.0. All previous models will not + | be compatible with the new version. + +h(3, "migrating-strings") Strings and hash values p