Update training section in NER guide and add links

This commit is contained in:
ines 2017-06-01 11:52:49 +02:00
parent d5c8d2f5fd
commit 72380c952a
1 changed files with 15 additions and 26 deletions

View File

@ -154,40 +154,29 @@ p
| To provide training examples to the entity recogniser, you'll first need | To provide training examples to the entity recogniser, you'll first need
| to create an instance of the #[+api("goldparse") #[code GoldParse]] class. | to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
| You can specify your annotations in a stand-off format or as token tags. | You can specify your annotations in a stand-off format or as token tags.
+code.
import random
import spacy
from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer
train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
nlp = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
for itn in range(5):
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:
doc = nlp.make_doc(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
nlp.tagger(doc)
ner.update(doc, gold)
p
| If a character offset in your entity annotations doesn't fall on a token | If a character offset in your entity annotations doesn't fall on a token
| boundary, the #[code GoldParse] class will treat that annotation as a | boundary, the #[code GoldParse] class will treat that annotation as a
| missing value. This allows for more realistic training, because the | missing value. This allows for more realistic training, because the
| entity recogniser is allowed to learn from examples that may feature | entity recogniser is allowed to learn from examples that may feature
| tokenizer errors. | tokenizer errors.
+aside-code("Example"). +code.
train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
+code.
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets']) doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O']) gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL'])
ner.update(doc, gold) +infobox
| For more details on #[strong training and updating] the named entity
| recognizer, see the usage guides on #[+a("/docs/usage/training") training]
| and #[+a("/docs/usage/training-ner") training the named entity recognizer],
| or check out the runnable
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
| on GitHub.
+h(3, "updating-biluo") The BILUO Scheme
p p
| You can also provide token-level entity annotation, using the | You can also provide token-level entity annotation, using the