mirror of https://github.com/explosion/spaCy.git
Update training section in NER guide and add links
This commit is contained in:
parent
d5c8d2f5fd
commit
72380c952a
|
@ -154,40 +154,29 @@ p
|
||||||
| To provide training examples to the entity recogniser, you'll first need
|
| To provide training examples to the entity recogniser, you'll first need
|
||||||
| to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
|
| to create an instance of the #[+api("goldparse") #[code GoldParse]] class.
|
||||||
| You can specify your annotations in a stand-off format or as token tags.
|
| You can specify your annotations in a stand-off format or as token tags.
|
||||||
|
|
||||||
+code.
|
|
||||||
import random
|
|
||||||
import spacy
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.pipeline import EntityRecognizer
|
|
||||||
|
|
||||||
train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
|
|
||||||
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
|
|
||||||
|
|
||||||
nlp = spacy.load('en', entity=False, parser=False)
|
|
||||||
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
|
|
||||||
|
|
||||||
for itn in range(5):
|
|
||||||
random.shuffle(train_data)
|
|
||||||
for raw_text, entity_offsets in train_data:
|
|
||||||
doc = nlp.make_doc(raw_text)
|
|
||||||
gold = GoldParse(doc, entities=entity_offsets)
|
|
||||||
|
|
||||||
nlp.tagger(doc)
|
|
||||||
ner.update(doc, gold)
|
|
||||||
|
|
||||||
p
|
|
||||||
| If a character offset in your entity annotations doesn't fall on a token
|
| If a character offset in your entity annotations doesn't fall on a token
|
||||||
| boundary, the #[code GoldParse] class will treat that annotation as a
|
| boundary, the #[code GoldParse] class will treat that annotation as a
|
||||||
| missing value. This allows for more realistic training, because the
|
| missing value. This allows for more realistic training, because the
|
||||||
| entity recogniser is allowed to learn from examples that may feature
|
| entity recogniser is allowed to learn from examples that may feature
|
||||||
| tokenizer errors.
|
| tokenizer errors.
|
||||||
|
|
||||||
+aside-code("Example").
|
+code.
|
||||||
|
train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
|
||||||
|
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])]
|
||||||
|
|
||||||
|
+code.
|
||||||
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
|
doc = Doc(nlp.vocab, [u'rats', u'make', u'good', u'pets'])
|
||||||
gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
|
gold = GoldParse(doc, [u'U-ANIMAL', u'O', u'O', u'O'])
|
||||||
ner = EntityRecognizer(nlp.vocab, entity_types=['ANIMAL'])
|
|
||||||
ner.update(doc, gold)
|
+infobox
|
||||||
|
| For more details on #[strong training and updating] the named entity
|
||||||
|
| recognizer, see the usage guides on #[+a("/docs/usage/training") training]
|
||||||
|
| and #[+a("/docs/usage/training-ner") training the named entity recognizer],
|
||||||
|
| or check out the runnable
|
||||||
|
| #[+src(gh("spaCy", "examples/training/train_ner.py")) training script]
|
||||||
|
| on GitHub.
|
||||||
|
|
||||||
|
+h(3, "updating-biluo") The BILUO Scheme
|
||||||
|
|
||||||
p
|
p
|
||||||
| You can also provide token-level entity annotation, using the
|
| You can also provide token-level entity annotation, using the
|
||||||
|
|
Loading…
Reference in New Issue