spaCy/spacy/train.py

from __future__ import absolute_import
from __future__ import unicode_literals

import random
from .gold import GoldParse
from .scorer import Scorer
from .gold import merge_sents


class Trainer(object):
    '''Manage training of an NLP pipeline.

    Wraps a pipeline object together with a collection of gold-standard
    annotations, and provides helpers to iterate over training examples,
    update the pipeline's components and score the pipeline on held-out data.

    Attributes:
        nlp: Object exposing `tokenizer` (callable on raw text, and providing
            a `tokens_from_list` method) and `pipeline` (a sequence of
            callables that are applied to each doc).
        gold_tuples: Mutable sequence of `(raw_text, paragraph_tuples)` pairs.
            It is shuffled in place by `epochs`.
    '''
    def __init__(self, nlp, gold_tuples):
        '''Store the pipeline and the gold-standard training data.

        nlp: The pipeline object to train.
        gold_tuples: Sequence of `(raw_text, paragraph_tuples)` pairs. The
            sequence is kept by reference, not copied.
        '''
        self.nlp = nlp
        self.gold_tuples = gold_tuples

    def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
        '''Iterate over the training data, one generator per epoch.

        nr_epoch: Number of epochs to produce.
        augment_data: Optional callable taking `(raw_text, paragraph_tuples)`
            and returning a replacement pair. It is applied to every example,
            after the `gold_preproc` handling.
        gold_preproc: If true, the raw text is discarded and one doc is built
            per gold sentence from the gold tokens. Otherwise the paragraph's
            annotations are passed through `merge_sents` and a single doc is
            tokenized from the raw text.

        Yields one generator per epoch; each of those yields `(doc, gold)`
        pairs. `self.gold_tuples` is shuffled in place right before each
        epoch is handed out, so finish consuming an epoch before asking for
        the next one.
        '''
        def _epoch():
            for raw_text, paragraph_tuples in self.gold_tuples:
                if gold_preproc:
                    raw_text = None
                else:
                    paragraph_tuples = merge_sents(paragraph_tuples)
                if augment_data is not None:
                    raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
                docs = self.make_docs(raw_text, paragraph_tuples)
                golds = self.make_golds(docs, paragraph_tuples)
                for doc, gold in zip(docs, golds):
                    yield doc, gold

        for _ in range(nr_epoch):
            random.shuffle(self.gold_tuples)
            yield _epoch()

    def update(self, doc, gold):
        '''Run one training step on a single example and return the doc.

        Every pipeline component is applied to `doc` in order. Components
        that expose an `update` method are first given the `(doc, gold)` pair,
        and are then still called on the doc so that later components see
        their output.
        '''
        for process in self.nlp.pipeline:
            if hasattr(process, 'update'):
                process.update(doc, gold)
            process(doc)
        return doc

    def evaluate(self, dev_sents, gold_preproc=False):
        '''Score the pipeline on held-out data and return the `Scorer`.

        dev_sents: Iterable of `(raw_text, paragraph_tuples)` pairs, in the
            same format as `gold_tuples`.
        gold_preproc: Same meaning as in `epochs`.
        '''
        scorer = Scorer()
        for raw_text, paragraph_tuples in dev_sents:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
            docs = self.make_docs(raw_text, paragraph_tuples)
            golds = self.make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                # Apply the complete pipeline, exactly as `update` does. The
                # docs are created directly by the tokenizer, so skipping the
                # first component would leave its annotations unset and the
                # scores would not reflect the trained pipeline.
                for process in self.nlp.pipeline:
                    process(doc)
                scorer.score(doc, gold)
        return scorer

    def make_docs(self, raw_text, paragraph_tuples):
        '''Create the docs for one training or evaluation example.

        Returns a one-element list holding the tokenized `raw_text` when it
        is given. When `raw_text` is None, returns one doc per entry of
        `paragraph_tuples`, built from that entry's `[0][1]` item (presumably
        the gold word list -- confirm against the gold data format).
        '''
        if raw_text is not None:
            return [self.nlp.tokenizer(raw_text)]
        else:
            return [self.nlp.tokenizer.tokens_from_list(sent_tuples[0][1])
                    for sent_tuples in paragraph_tuples]

    def make_golds(self, docs, paragraph_tuples):
        '''Create the `GoldParse` objects matching `docs`.

        With a single doc, every entry of `paragraph_tuples` is paired with
        that doc. Otherwise docs and entries are paired positionally, and
        surplus items on either side are ignored.
        '''
        if len(docs) == 1:
            return [GoldParse(docs[0], sent_tuples[0])
                    for sent_tuples in paragraph_tuples]
        else:
            return [GoldParse(doc, sent_tuples[0])
                    for doc, sent_tuples in zip(docs, paragraph_tuples)]
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`from __future__ import absolute_import`
			`from __future__ import unicode_literals`

			`import random`
			`from .gold import GoldParse`
			`from .scorer import Scorer`
Update train method 2016-10-13 01:24:53 +00:00			`from .gold import merge_sents`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00

			`class Trainer(object):`
Add docstring for Trainer class. 2016-10-12 12:26:02 +00:00			`'''Manage training of an NLP pipeline.'''`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`def __init__(self, nlp, gold_tuples):`
			`self.nlp = nlp`
			`self.gold_tuples = gold_tuples`

Update train method 2016-10-13 01:24:53 +00:00			`def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`def _epoch():`
			`for raw_text, paragraph_tuples in self.gold_tuples:`
Update train method 2016-10-13 01:24:53 +00:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`paragraph_tuples = merge_sents(paragraph_tuples)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`if augment_data is not None:`
			`raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)`
			`docs = self.make_docs(raw_text, paragraph_tuples)`
			`golds = self.make_golds(docs, paragraph_tuples)`
			`for doc, gold in zip(docs, golds):`
			`yield doc, gold`

			`for itn in range(nr_epoch):`
			`random.shuffle(self.gold_tuples)`
			`yield _epoch()`

			`def update(self, doc, gold):`
Fix spacy.train 2016-10-15 21:53:46 +00:00			`for process in self.nlp.pipeline:`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`if hasattr(process, 'update'):`
			`process.update(doc, gold)`
			`process(doc)`
			`return doc`

Update train method 2016-10-13 01:24:53 +00:00			`def evaluate(self, dev_sents, gold_preproc=False):`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`scorer = Scorer()`
			`for raw_text, paragraph_tuples in dev_sents:`
Update train method 2016-10-13 01:24:53 +00:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
			`paragraph_tuples = merge_sents(paragraph_tuples)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`docs = self.make_docs(raw_text, paragraph_tuples)`
			`golds = self.make_golds(docs, paragraph_tuples)`
			`for doc, gold in zip(docs, golds):`
			`for process in self.nlp.pipeline[1:]:`
			`process(doc)`
			`scorer.score(doc, gold)`
			`return scorer`

			`def make_docs(self, raw_text, paragraph_tuples):`
			`if raw_text is not None:`
			`return [self.nlp.tokenizer(raw_text)]`
			`else:`
			`return [self.nlp.tokenizer.tokens_from_list(sent_tuples[0][1])`
			`for sent_tuples in paragraph_tuples]`

			`def make_golds(self, docs, paragraph_tuples):`
			`if len(docs) == 1:`
			`return [GoldParse(docs[0], sent_tuples[0])`
			`for sent_tuples in paragraph_tuples]`
			`else:`
			`return [GoldParse(doc, sent_tuples[0])`
			`for doc, sent_tuples in zip(docs, paragraph_tuples)]`