# spaCy/examples/train_ner_standalone.py

'''Example of training a named entity recognition system from scratch using spaCy

This example is written to be self-contained and reasonably transparent.
To achieve that, it duplicates some of spaCy's internal functionality.

Specifically, in this example, we don't use spaCy's built-in Language class to
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
our own simple Pipeline class, so that it's easier to see how the pieces
interact.

Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip

Developed for: spaCy 1.7.1
Last tested for: spaCy 1.7.1
'''
from __future__ import unicode_literals, print_function
import plac
from pathlib import Path
import random
import json

import spacy.orth as orth_funcs
from spacy.vocab import Vocab
from spacy.pipeline import BeamEntityRecognizer
from spacy.pipeline import EntityRecognizer
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.attrs import *
from spacy.gold import GoldParse
from spacy.gold import _iob_to_biluo as iob_to_biluo
from spacy.scorer import Scorer

# Python 2/3 compatibility shim: probe for the `unicode` builtin and, when the
# lookup raises NameError, alias the name to `str` so the isinstance checks
# further down work either way.
try:
    unicode
except NameError:
    unicode = str


def init_vocab():
    '''Build a fresh Vocab with hand-written lexical attribute getters.

    Every getter takes the word's string and returns the value for one
    lexical attribute. CLUSTER, IS_STOP and IS_OOV are constant here
    (0, False and True respectively).
    '''
    getters = {}
    getters[LOWER] = lambda string: string.lower()
    getters[SHAPE] = orth_funcs.word_shape
    getters[PREFIX] = lambda string: string[0]
    getters[SUFFIX] = lambda string: string[-3:]
    getters[CLUSTER] = lambda string: 0
    getters[IS_ALPHA] = orth_funcs.is_alpha
    getters[IS_ASCII] = orth_funcs.is_ascii
    getters[IS_DIGIT] = lambda string: string.isdigit()
    getters[IS_LOWER] = orth_funcs.is_lower
    getters[IS_PUNCT] = orth_funcs.is_punct
    getters[IS_SPACE] = lambda string: string.isspace()
    getters[IS_TITLE] = orth_funcs.is_title
    getters[IS_UPPER] = orth_funcs.is_upper
    getters[IS_STOP] = lambda string: False
    getters[IS_OOV] = lambda string: True
    return Vocab(lex_attr_getters=getters)


def save_vocab(vocab, path):
    '''Write `vocab` into the directory `path`.

    The directory is created when it does not exist yet. The string store
    goes to `strings.json` and the lexemes to `lexemes.bin`.

    Raises IOError if `path` exists but is not a directory.
    '''
    out_dir = Path(path)
    if out_dir.exists():
        if not out_dir.is_dir():
            raise IOError("Can't save vocab to %s\nNot a directory" % out_dir)
    else:
        out_dir.mkdir()
    strings_loc = out_dir / 'strings.json'
    lexemes_loc = out_dir / 'lexemes.bin'
    with strings_loc.open('w') as file_:
        vocab.strings.dump(file_)
    # vocab.dump is handed a plain posix-style string rather than a Path.
    vocab.dump(lexemes_loc.as_posix())


def load_vocab(path):
    '''Load a Vocab from the directory `path`.

    Raises IOError if `path` does not exist or is not a directory.
    '''
    vocab_dir = Path(path)
    if vocab_dir.is_dir():
        return Vocab.load(vocab_dir)
    # Not a usable directory: report which of the two problems it is.
    if vocab_dir.exists():
        raise IOError("Cannot load vocab from %s\nNot a directory" % vocab_dir)
    raise IOError("Cannot load vocab from %s\nDoes not exist" % vocab_dir)


def init_ner_model(vocab, features=None):
    '''Create an untrained BeamEntityRecognizer for `vocab`.

    When `features` is None, the feature templates declared on
    EntityRecognizer are used (converted to a tuple).
    '''
    templates = features
    if templates is None:
        templates = tuple(EntityRecognizer.feature_templates)
    return BeamEntityRecognizer(vocab, features=templates)


def save_ner_model(model, path):
    '''Write an entity recognizer into the directory `path`.

    The directory is created when it does not exist yet. The weights are
    dumped to `model` and `model.cfg` is serialized to `config.json`.

    Raises IOError if `path` is not a directory.
    '''
    target = Path(path)
    if not target.exists():
        target.mkdir()
    if not target.is_dir():
        raise IOError("Can't save model to %s\nNot a directory" % target)
    weights_loc = (target / 'model').as_posix()
    config_loc = target / 'config.json'
    model.model.dump(weights_loc)
    with config_loc.open('w') as file_:
        serialized = json.dumps(model.cfg)
        # The file is opened in text mode, so a byte string result from
        # json.dumps is decoded before writing.
        if isinstance(serialized, unicode):
            file_.write(serialized)
        else:
            file_.write(serialized.decode('utf8'))


def load_ner_model(vocab, path):
    '''Load a BeamEntityRecognizer from `path`, passing `vocab` along to it.'''
    ner_model = BeamEntityRecognizer.load(path, vocab)
    return ner_model


class Pipeline(object):
    '''A hand-wired vocab + tokenizer + entity recognizer.

    Attributes set by the constructor:
        vocab: the Vocab used to create Doc objects.
        tokenizer: callable turning a unicode string into a Doc.
        entity: the entity recognizer being trained or applied.
        pipeline: list of processing steps applied, in order, to each Doc.
    '''
    @classmethod
    def load(cls, path):
        '''Load a pipeline previously written by `save()`.

        Expects `path` to be a directory containing `vocab` and `ner`
        sub-directories. Raises IOError if `path` does not exist or is not
        a directory.
        '''
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        vocab = load_vocab(path / 'vocab')
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        ner_model = load_ner_model(vocab, path / 'ner')
        return cls(vocab, tokenizer, ner_model)

    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
        '''Wire the components together, creating defaults for any left as None.

        vocab: Vocab to use; defaults to `init_vocab()`.
        tokenizer: tokenizer to use; defaults to a Tokenizer built on the
            vocab (presumably with no special-case rules and no
            prefix/suffix/infix handling -- confirm against the Tokenizer API).
        ner_model: entity recognizer to use; defaults to
            `init_ner_model(vocab)`.
        '''
        # Resolve the defaults first, then assign unconditionally, so that
        # components passed in explicitly (e.g. by `load()`) are stored too
        # and the default tokenizer is built on the resolved vocab.
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if ner_model is None:
            ner_model = init_ner_model(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.entity = ner_model
        self.pipeline = [self.entity]

    def __call__(self, input_):
        '''Create a Doc from `input_` and run every pipeline step on it.'''
        doc = self.make_doc(input_)
        for process in self.pipeline:
            process(doc)
        return doc

    def make_doc(self, input_):
        '''Turn `input_` into a Doc.

        Byte strings are decoded as UTF-8. Text is run through the tokenizer;
        anything else is passed to Doc as the list of words.
        '''
        if isinstance(input_, bytes):
            input_ = input_.decode('utf8')
        if isinstance(input_, unicode):
            return self.tokenizer(input_)
        else:
            return Doc(self.vocab, words=input_)

    def make_gold(self, input_, annotations):
        '''Build a GoldParse for `input_` with `annotations` as its entities.'''
        doc = self.make_doc(input_)
        gold = GoldParse(doc, entities=annotations)
        return gold

    def update(self, input_, annot):
        '''Run one training update on a single example.

        Returns whatever the entity recognizer's `update()` returns (the
        training loop in this file sums it up as the loss).
        '''
        doc = self.make_doc(input_)
        gold = self.make_gold(input_, annot)
        # Make sure the model knows every label in this example before the
        # update. Tags look like "<action>-<label>"; None, '-' and 'O' carry
        # no label.
        for ner in gold.ner:
            if ner not in (None, '-', 'O'):
                _action, label = ner.split('-', 1)
                self.entity.add_label(label)
        return self.entity.update(doc, gold)

    def evaluate(self, examples):
        '''Score the pipeline's predictions on `examples`.

        examples: iterable of (input, annotations) pairs.
        Returns the `scores` of the Scorer after all examples were scored.
        '''
        scorer = Scorer()
        for input_, annot in examples:
            gold = self.make_gold(input_, annot)
            doc = self(input_)
            scorer.score(doc, gold)
        return scorer.scores

    def average_weights(self):
        '''Finish training by calling `end_training()` on the underlying model.'''
        self.entity.model.end_training()

    def save(self, path):
        '''Write the pipeline to the directory `path`.

        The directory is created when it does not exist yet; the vocab goes
        to `path/vocab` and the entity recognizer to `path/ner`.
        Raises IOError if `path` exists but is not a directory.
        '''
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        save_vocab(self.vocab, path / 'vocab')
        save_ner_model(self.entity, path / 'ner')


def train(nlp, train_examples, dev_examples, nr_epoch=5):
    '''Train `nlp` for `nr_epoch` passes, printing dev scores after each one.

    nlp: object providing `update`, `evaluate` and `average_weights`.
    train_examples: iterable of (input, annotations) pairs. It is iterated
        only once; the examples are collected during each pass so that the
        following pass can run over a shuffled copy.
    dev_examples: examples passed to `nlp.evaluate`; evaluated repeatedly.
    nr_epoch: number of passes over the training data.

    After the last pass the weights are averaged and the dev scores are
    printed once more.
    '''
    print("Iter", "Loss", "P", "R", "F")
    upcoming = train_examples
    for epoch in range(nr_epoch):
        keep_for_next = (epoch + 1) < nr_epoch
        current, upcoming = upcoming, []
        total_loss = 0
        for input_, annot in current:
            total_loss += nlp.update(input_, annot)
            if keep_for_next:
                upcoming.append((input_, annot))
        random.shuffle(upcoming)
        dev_scores = nlp.evaluate(dev_examples)
        columns = ['%.2f' % dev_scores[key]
                   for key in ('ents_p', 'ents_r', 'ents_f')]
        print(epoch, int(total_loss), *columns)
    nlp.average_weights()
    final_scores = nlp.evaluate(dev_examples)
    print("After averaging")
    print(final_scores['ents_p'], final_scores['ents_r'], final_scores['ents_f'])


def read_examples(path):
    '''Yield (words, BILUO tags) pairs read from an annotated data file.

    Sentences are separated by blank lines. Lines starting with '#' at the
    top of a sentence are dropped. Every remaining non-blank line is split
    on whitespace: the second column is taken as the word and the third as
    its IOB tag. The IOB tags of each sentence are converted with
    `iob_to_biluo` before being yielded.

    path: location of the data file (str or Path), read as UTF-8.
    '''
    path = Path(path)
    # Read as UTF-8 explicitly: without an encoding, the platform's locale
    # default is used, which breaks on non-ASCII data on some systems.
    # The whole file is read up front so the handle is closed before the
    # first example is yielded, instead of staying open while the generator
    # is suspended.
    with path.open(encoding='utf8') as file_:
        text = file_.read()
    for sent in text.strip().split('\n\n'):
        if not sent.strip():
            continue
        tokens = sent.split('\n')
        while tokens and tokens[0].startswith('#'):
            tokens.pop(0)
        words = []
        iob = []
        for token in tokens:
            if token.strip():
                pieces = token.split()
                words.append(pieces[1])
                iob.append(pieces[2])
        yield words, iob_to_biluo(iob)


@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir, train_loc, dev_loc, nr_epoch=10):
    # Command-line entry point: train a fresh pipeline on the data at
    # train_loc, report scores on the data at dev_loc after every epoch, and
    # save the result to model_dir.
    # Deliberately no docstring: plac would show it in --help.
    train_stream = read_examples(train_loc)
    dev_stream = read_examples(dev_loc)
    nlp = Pipeline()

    # The dev set is evaluated after every epoch, so it has to be a list;
    # the training examples can stay a one-shot generator.
    dev_examples = list(dev_stream)
    train(nlp, train_stream, dev_examples, nr_epoch)

    nlp.save(model_dir)


# Only run when executed as a script: plac builds the command-line interface
# from main's signature and annotations, then calls it.
if __name__ == '__main__':
    plac.call(main)