spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import io
import random

import plac
import re

import spacy.util

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents

from spacy.scorer import Scorer

from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.nonproj import PseudoProjectivity


def _corrupt(c, noise_level):
    """Return a noisy variant of ``c`` with probability ``noise_level``.

    ``c`` is a single character when called on raw text, or a whole word
    when called on a token list. With probability ``1 - noise_level`` it is
    returned unchanged. Otherwise:

    * a space becomes a newline, and a newline becomes a space;
    * ``.``, ``'``, ``!`` and ``?`` are dropped (an empty string is returned);
    * anything else is lower-cased.
    """
    if random.random() >= noise_level:
        return c
    elif c == ' ':
        return '\n'
    elif c == '\n':
        return ' '
    elif c in ('.', "'", '!', '?'):
        return ''
    else:
        return c.lower()


def add_noise(orig, noise_level):
    """Randomly corrupt a raw text or a list of tokens.

    The input is first left untouched with probability ``1 - noise_level``.
    If it is selected for corruption, every character (for a string) or
    every word (for a list) is passed through ``_corrupt``, which applies
    ``noise_level`` again on a per-item basis.

    Args:
        orig: A string, or a list (or list subclass) of word strings.
        noise_level: Probability in ``[0, 1]``; ``0`` disables corruption.

    Returns:
        ``orig`` itself when no corruption is applied. Otherwise a new
        string, or a new plain ``list`` of the surviving words.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance (not an exact type comparison) so that list subclasses are
    # handled as token lists instead of being joined into a single string.
    if isinstance(orig, list):
        noisy_words = (_corrupt(word, noise_level) for word in orig)
        # An empty string marks a dropped token; remove those entries.
        return [word for word in noisy_words if word]
    return ''.join(_corrupt(c, noise_level) for c in orig)


def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Annotate one example with ``nlp`` and score it against the gold data.

    When ``raw_text`` is None the document is built from the gold words in
    ``annot_tuples[1]``; otherwise ``raw_text`` is tokenized with
    ``nlp.tokenizer``. The tagger, entity recognizer and parser are then
    applied in that order, and the result is passed to ``scorer.score``
    together with a ``GoldParse`` built from ``annot_tuples``.
    """
    tokenizer = nlp.tokenizer
    if raw_text is not None:
        doc = tokenizer(raw_text)
    else:
        doc = tokenizer.tokens_from_list(annot_tuples[1])
    # Components are looked up and applied one at a time, in this order.
    for component in ('tagger', 'entity', 'parser'):
        getattr(nlp, component)(doc)
    # NOTE(review): evaluate() builds its gold parse with
    # GoldParse.from_annot_tuples; confirm this direct constructor call is
    # still supported by the installed spaCy version.
    reference = GoldParse(doc, annot_tuples)
    scorer.score(doc, reference, verbose=verbose)


def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
        n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
    """Train a model for ``n_iter`` epochs, printing dev scores per epoch.

    A trainer is obtained from ``Language.train`` and used as a context
    manager. Every ``(doc, gold)`` pair of each epoch is passed to
    ``trainer.update``; after each epoch the model is evaluated on
    ``dev_data`` and one tab-separated row of scores is printed.

    ``seed``, ``n_sents`` and ``corruption_level`` are accepted but not used
    by this function. ``loss`` is never updated, so the ``P.Loss`` column
    always shows 0.
    """
    header = "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
    row_template = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
    print(header)
    trainer_context = Language.train(model_dir, train_data,
                                     tagger_cfg, parser_cfg, entity_cfg)
    with trainer_context as trainer:
        loss = 0
        all_epochs = trainer.epochs(n_iter, gold_preproc=gold_preproc,
                                    augment_data=None)
        for itn, epoch in enumerate(all_epochs):
            for example in epoch:
                doc, gold = example
                trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
            row = row_template.format(itn, loss, **dev_scores.scores)
            print(row)


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    """Score the model stored in ``model_dir`` on ``gold_tuples``.

    Args:
        Language: Language class; instantiated with ``path=model_dir``.
        gold_tuples: Iterable of ``(raw_text, sents)`` pairs, where ``sents``
            yields ``(annot_tuples, brackets)`` pairs.
        model_dir: Location of the trained model.
        gold_preproc: If true, the raw text is ignored and each gold sentence
            is processed from its gold words (``annot_tuples[1]``). If false,
            the sentences are first combined with ``merge_sents``.
        verbose: Forwarded to ``Scorer.score``.
        beam_width: If not None, assigned to ``nlp.parser.cfg.beam_width``.
        cand_preproc: Not used by this function.

    Returns:
        The ``Scorer`` that accumulated the results.
    """
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        # For German the lemmatizer is replaced by one that returns only the
        # input string itself.
        nlp.vocab.morphology.lemmatizer = lambda string, pos: {string}
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = merge_sents(sents)
        # With gold preprocessing the raw text is never tokenized.
        text = None if gold_preproc else raw_text
        for annot_tuples, _ in sents:
            if text is not None:
                doc = nlp(text)
            else:
                doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(doc)
                nlp.parser(doc)
                nlp.entity(doc)
            reference = GoldParse.from_annot_tuples(doc, annot_tuples)
            scorer.score(doc, reference, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the development data and write the predictions to ``out_loc``.

    The gold sentences of each paragraph are combined with ``merge_sents``.
    When a paragraph has no raw text, the document is built from the gold
    words (``annot_tuples[1]``) and the tagger, entity recognizer and parser
    are applied explicitly; otherwise the full pipeline is run on the raw
    text.

    One line is written per non-whitespace token, with the tab-separated
    fields ``t.i``, ``t.orth_``, ``t.tag_``, ``t.head.orth_`` and ``t.dep_``.
    A blank line follows every sentence. The file is UTF-8 encoded.

    Args:
        Language: Language class; instantiated with ``path=model_dir``, the
            same way ``evaluate`` loads the model.
        dev_loc: Location passed to ``read_json_file``.
        model_dir: Location of the trained model.
        out_loc: Path of the output file; it is overwritten.
    """
    nlp = Language(path=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # The encoding has to be passed by keyword: the third positional argument
    # of io.open is ``buffering``. The ``with`` block closes the file even if
    # parsing raises.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')


@plac.annotations(
    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
    pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
    """Train and/or evaluate a tagger, parser and entity recognizer.

    Unless eval_only is set, a model is trained on train_loc and saved to
    model_dir. If out_loc is given, the predicted parses for dev_loc are
    written there. Finally the model is scored on dev_loc and the token,
    tag, attachment and entity scores are printed.
    """
    # Snapshot the command-line arguments once, while they are the only
    # locals. Calling locals() again for each config would also capture the
    # config dicts created just before it.
    cli_args = dict(locals())
    parser_cfg = dict(cli_args)
    tagger_cfg = dict(cli_args)
    entity_cfg = dict(cli_args)

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        gold_dev = list(read_json_file(dev_loc))
        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
              n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)

# Script entry point: plac.call invokes main with arguments taken from the
# command line, as described by the plac annotations on main.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 17:53:26 +00:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 02:52:35 +00:00			`from __future__ import print_function`
* Add parser training script 2015-01-09 17:53:26 +00:00
			`import os`
			`from os import path`
			`import shutil`
caught another codecs.open 2015-09-30 18:16:52 +00:00			`import io`
* Add parser training script 2015-01-09 17:53:26 +00:00			`import random`

			`import plac`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`import re`
* Add parser training script 2015-01-09 17:53:26 +00:00
			`import spacy.util`

			`from spacy.syntax.util import Config`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`from spacy.gold import read_json_file`
			`from spacy.gold import GoldParse`
Update train.py 2016-10-13 01:23:48 +00:00			`from spacy.gold import merge_sents`
* Add parser training script 2015-01-09 17:53:26 +00:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`from spacy.scorer import Scorer`

* Update train.py for language-generic spaCy 2015-09-06 15:51:48 +00:00			`from spacy.syntax.arc_eager import ArcEager`
			`from spacy.syntax.ner import BiluoPushDown`
			`from spacy.tagger import Tagger`
			`from spacy.syntax.parser import Parser`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00			`from spacy.syntax.nonproj import PseudoProjectivity`
* Update train.py for language-generic spaCy 2015-09-06 15:51:48 +00:00
* Add parser training script 2015-01-09 17:53:26 +00:00
* Update input corruption method to work with lists as well as strings 2015-06-05 17:33:32 +00:00			`def _corrupt(c, noise_level):`
* Tmp commit 2015-05-24 00:50:14 +00:00			`if random.random() >= noise_level:`
			`return c`
			`elif c == ' ':`
			`return '\n'`
			`elif c == '\n':`
			`return ' '`
			`elif c in ['.', "'", "!", "?"]:`
			`return ''`
			`else:`
			`return c.lower()`


* Update input corruption method to work with lists as well as strings 2015-06-05 17:33:32 +00:00			`def add_noise(orig, noise_level):`
			`if random.random() >= noise_level:`
			`return orig`
			`elif type(orig) == list:`
			`corrupted = [_corrupt(word, noise_level) for word in orig]`
			`corrupted = [w for w in corrupted if w]`
			`return corrupted`
			`else:`
			`return ''.join(_corrupt(c, noise_level) for c in orig)`


* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 15:45:31 +00:00			`def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 17:14:02 +00:00			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`else:`
* Fix bug in train.py 2015-05-31 04:49:06 +00:00			`tokens = nlp.tokenizer(raw_text)`
* Fix train.py 2015-06-05 13:50:24 +00:00			`nlp.tagger(tokens)`
* Allow parser to jackknife POS tags before training. 2015-05-30 23:11:11 +00:00			`nlp.entity(tokens)`
			`nlp.parser(tokens)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 17:14:02 +00:00			`gold = GoldParse(tokens, annot_tuples)`
* Temporarily disable NER, and wire up the verbose flag during training 2015-06-14 15:45:31 +00:00			`scorer.score(tokens, gold, verbose=verbose)`
* Update train.py, to support paragraphs where there's no raw_text 2015-05-27 17:14:02 +00:00

Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,`
Fix train.py for v1.0.0-rc1 2016-10-04 23:11:46 +00:00			`n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 02:52:35 +00:00			`print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'`
			`with Language.train(model_dir, train_data,`
			`tagger_cfg, parser_cfg, entity_cfg) as trainer:`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`loss = 0`
Update train.py 2016-10-13 01:23:48 +00:00			`for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,`
			`augment_data=None)):`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`for doc, gold in epoch:`
			`trainer.update(doc, gold)`
Update train.py 2016-10-13 01:23:48 +00:00			`dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`print(format_str.format(itn, loss, **dev_scores.scores))`
* Add parser training script 2015-01-09 17:53:26 +00:00
* Update bin/parser/train for printing output. 2015-10-05 23:35:22 +00:00
* Add more options to bin/parser/train 2015-06-05 21:49:26 +00:00			`def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,`
* Update bin/parser/train for printing output. 2015-10-05 23:35:22 +00:00			`beam_width=None, cand_preproc=None):`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`nlp = Language(path=model_dir)`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00			`if nlp.lang == 'de':`
			`nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])`
* Add more options to bin/parser/train 2015-06-05 21:49:26 +00:00			`if beam_width is not None:`
			`nlp.parser.cfg.beam_width = beam_width`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`scorer = Scorer()`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`for raw_text, sents in gold_tuples:`
* Fix gold_preproc flag in train.py 2015-05-30 03:23:02 +00:00			`if gold_preproc:`
			`raw_text = None`
			`else:`
Update train.py 2016-10-13 01:23:48 +00:00			`sents = merge_sents(sents)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`for annot_tuples, brackets in sents:`
* Fix gold_preproc flag in train.py 2015-05-30 03:23:02 +00:00			`if raw_text is None:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
			`nlp.parser(tokens)`
* Train after parsing, not before. 2015-11-11 17:43:52 +00:00			`nlp.entity(tokens)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`else:`
* Update bin/parser/train for printing output. 2015-10-05 23:35:22 +00:00			`tokens = nlp(raw_text)`
Fix train script for 1.0 2016-11-25 14:57:37 +00:00			`gold = GoldParse.from_annot_tuples(tokens, annot_tuples)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 05:17:12 +00:00

* Update bin/parser/train for printing output. 2015-10-05 23:35:22 +00:00			`def write_parses(Language, dev_loc, model_dir, out_loc):`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 17:08:48 +00:00			`nlp = Language(data_dir=model_dir)`
			`gold_tuples = read_json_file(dev_loc)`
* Add write_parses function 2015-03-20 00:14:20 +00:00			`scorer = Scorer()`
* Use io module instead of deprecated codecs module 2015-10-10 03:13:01 +00:00			`out_file = io.open(out_loc, 'w', 'utf8')`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 17:08:48 +00:00			`for raw_text, sents in gold_tuples:`
			`sents = _merge_sents(sents)`
			`for annot_tuples, brackets in sents:`
			`if raw_text is None:`
			`tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])`
			`nlp.tagger(tokens)`
* Uncomment NER training 2015-06-16 21:36:54 +00:00			`nlp.entity(tokens)`
* Fix write_parses mode of bin/parser/train.py 2015-06-07 17:08:48 +00:00			`nlp.parser(tokens)`
			`else:`
* Update bin/parser/train for printing output. 2015-10-05 23:35:22 +00:00			`tokens = nlp(raw_text)`
			`#gold = GoldParse(tokens, annot_tuples)`
			`#scorer.score(tokens, gold, verbose=False)`
			`for sent in tokens.sents:`
			`for t in sent:`
			`if not t.is_space:`
			`out_file.write(`
			`'%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
			`out_file.write('\n')`
* Add write_parses function 2015-03-20 00:14:20 +00:00

* Tmp commit 2015-02-23 19:05:04 +00:00			`@plac.annotations(`
* Add initial stuff for Chinese parsing 2016-04-24 16:44:24 +00:00			`language=("The language to train", "positional", None, str, ['en','de', 'zh']),`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 01:52:55 +00:00			`train_loc=("Location of training file or directory"),`
			`dev_loc=("Location of development file or directory"),`
* Add more options to bin/parser/train 2015-06-05 21:49:26 +00:00			`model_dir=("Location of output model directory",),`
			`eval_only=("Skip training, and only evaluate", "flag", "e", bool),`
* Tmp commit 2015-05-24 00:50:14 +00:00			`corruption_level=("Amount of noise to add to training data", "option", "c", float),`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),`
* Add write_parses function 2015-03-20 00:14:20 +00:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`n_sents=("Number of training sentences", "option", "n", int),`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`n_iter=("Number of training iterations", "option", "i", int),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Add toggle for OrigArcEager system 2015-06-14 18:28:14 +00:00			`debug=("Debug mode", "flag", "d", bool),`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00			`pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),`
* Tmp commit 2015-02-23 19:05:04 +00:00			`)`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00			`def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,`
			`debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):`
Fix train.py for v1.0.0-rc1 2016-10-04 23:11:46 +00:00			`parser_cfg = dict(locals())`
			`tagger_cfg = dict(locals())`
			`entity_cfg = dict(locals())`

* Add initial stuff for Chinese parsing 2016-04-24 16:44:24 +00:00			`lang = spacy.util.get_lang_class(language)`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00
			`parser_cfg['features'] = lang.Defaults.parser_features`
			`entity_cfg['features'] = lang.Defaults.entity_features`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00
* Add more options to bin/parser/train 2015-06-05 21:49:26 +00:00			`if not eval_only:`
			`gold_train = list(read_json_file(train_loc))`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`gold_dev = list(read_json_file(dev_loc))`
			`train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,`
Fix train.py for v1.0.0-rc1 2016-10-04 23:11:46 +00:00			`n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,`
			`n_iter=n_iter)`
* Update bin/parser/train for printing output. 2015-10-05 23:35:22 +00:00			`if out_loc:`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00			`write_parses(lang, dev_loc, model_dir, out_loc)`
			`scorer = evaluate(lang, list(read_json_file(dev_loc)),`
* Fix redundant options in train.py 2015-07-17 20:38:05 +00:00			`model_dir, gold_preproc=gold_preproc, verbose=verbose)`
* Use print function in train.py, for py 2/3 compatibility 2015-07-24 02:52:35 +00:00			`print('TOK', scorer.token_acc)`
			`print('POS', scorer.tags_acc)`
			`print('UAS', scorer.uas)`
			`print('LAS', scorer.las)`

			`print('NER P', scorer.ents_p)`
			`print('NER R', scorer.ents_r)`
			`print('NER F', scorer.ents_f)`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00
* Add parser training script 2015-01-09 17:53:26 +00:00
			`if __name__ == '__main__':`
			`plac.call(main)`