# mirror of https://github.com/explosion/spaCy.git
from __future__ import absolute_import
from __future__ import unicode_literals

import random

from .gold import GoldParse
from .scorer import Scorer


class Trainer(object):
    """Train the components of a spaCy pipeline on gold-standard annotations."""

    def __init__(self, nlp, gold_tuples):
        self.nlp = nlp
        self.gold_tuples = gold_tuples

    def epochs(self, nr_epoch, augment_data=None):
        """Yield `nr_epoch` iterators over (doc, gold) training pairs,
        shuffling the gold data before each pass. If `augment_data` is given,
        it is applied to each (raw_text, paragraph_tuples) pair first."""
        def _epoch():
            for raw_text, paragraph_tuples in self.gold_tuples:
                if augment_data is not None:
                    raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                docs = self.make_docs(raw_text, paragraph_tuples)
                golds = self.make_golds(docs, paragraph_tuples)
                for doc, gold in zip(docs, golds):
                    yield doc, gold

        for itn in range(nr_epoch):
            random.shuffle(self.gold_tuples)
            yield _epoch()

    def update(self, doc, gold):
        """Run the pipeline over `doc`, letting each component that exposes
        an `update` method learn from the `gold` annotations."""
        for process in self.nlp.pipeline[1:]:
            if hasattr(process, 'update'):
                process.update(doc, gold)
            process(doc)
        return doc

    def evaluate(self, dev_sents):
        """Apply the pipeline to held-out sentences and score the predictions
        against the gold annotations, returning the populated Scorer."""
        scorer = Scorer()
        for raw_text, paragraph_tuples in dev_sents:
            docs = self.make_docs(raw_text, paragraph_tuples)
            golds = self.make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                for process in self.nlp.pipeline[1:]:
                    process(doc)
                scorer.score(doc, gold)
        return scorer

    def make_docs(self, raw_text, paragraph_tuples):
        """Create Doc objects, either by tokenizing the raw text or, when no
        raw text is available, from the pre-tokenized word lists."""
        if raw_text is not None:
            return [self.nlp.tokenizer(raw_text)]
        else:
            return [self.nlp.tokenizer.tokens_from_list(sent_tuples[0][1])
                    for sent_tuples in paragraph_tuples]

    def make_golds(self, docs, paragraph_tuples):
        """Build a GoldParse for each doc from the annotation tuples."""
        if len(docs) == 1:
            return [GoldParse(docs[0], sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        else:
            return [GoldParse(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
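

# Illustrative usage sketch (not part of the original module). It assumes a
# loaded spaCy 1.x pipeline and training/dev data already converted to the
# (raw_text, paragraph_tuples) format this class expects; `nlp`, `train_data`
# and `dev_data` are placeholder names.
#
#     trainer = Trainer(nlp, train_data)
#     for epoch in trainer.epochs(nr_epoch=10):
#         for doc, gold in epoch:
#             trainer.update(doc, gold)
#     scorer = trainer.evaluate(dev_data)
#     print(scorer.uas, scorer.las)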