spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config
from spacy.syntax.conll import read_docparse_file
from spacy.syntax.conll import GoldParse

from spacy.scorer import Scorer


def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, n_sents=0):
    """Train the tagger, parser and entity recogniser and save them in model_dir.

    The 'deps', 'pos' and 'ner' sub-directories of ``model_dir`` are deleted
    (when present) and re-created empty before training starts, so any models
    previously stored there are lost.

    Arguments:
        Language: Pipeline class. It is instantiated with ``data_dir=model_dir``
            and must expose ``ParserTransitionSystem`` and
            ``EntityTransitionSystem``, each with a ``get_labels`` method.
        train_loc: Location of the training file, handed to
            ``read_docparse_file``.
        model_dir: Directory that holds the model sub-directories and
            ``vocab/strings.txt``.
        n_iter: Number of passes over the training data.
        feat_set: Feature-set name written to the parser config.
        seed: Seed value written to both configs.
            NOTE(review): it is not used to seed ``random`` here, so the
            shuffle order is not controlled by this argument -- confirm intent.
        gold_preproc: If true, train on the pre-segmented text of each document
            instead of tokenizing the raw text.
        n_sents: If greater than zero, only the first ``n_sents`` entries
            returned by ``read_docparse_file`` are used for training.
    """
    dep_model_dir, pos_model_dir, ner_model_dir = [
        path.join(model_dir, name) for name in ('deps', 'pos', 'ner')
    ]
    component_dirs = (dep_model_dir, pos_model_dir, ner_model_dir)
    # Remove every stale directory first, then create all of them afresh.
    for stale_dir in component_dirs:
        if path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    for fresh_dir in component_dirs:
        os.mkdir(fresh_dir)

    tag_names = sorted(POS_TAGS.keys())
    setup_model_dir(tag_names, POS_TAGS, POS_TEMPLATES, pos_model_dir)

    gold_tuples = read_docparse_file(train_loc)

    # Labels are collected from the full data set, before any n_sents cut-off.
    dep_labels = Language.ParserTransitionSystem.get_labels(gold_tuples)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=dep_labels)
    ner_labels = Language.EntityTransitionSystem.get_labels(gold_tuples)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=ner_labels)

    gold_tuples = gold_tuples[:n_sents] if n_sents > 0 else gold_tuples
    nlp = Language(data_dir=model_dir)

    print("Itn.\tUAS\tNER F.\tTag %")
    for itn in range(n_iter):
        scorer = Scorer()
        for raw_text, segmented_text, annot_tuples in gold_tuples:
            # Score with the current models *before* updating on this document,
            # so the figures printed per iteration are pre-update figures.
            eval_doc = nlp(raw_text, merge_mwes=False)
            eval_gold = GoldParse(eval_doc, annot_tuples)
            scorer.score(eval_doc, eval_gold, verbose=False)

            # NOTE(review): with gold_preproc every segment is paired with the
            # annotations of the whole document -- confirm this is intended.
            train_docs = (
                [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
                if gold_preproc
                else [nlp.tokenizer(raw_text)]
            )
            for doc in train_docs:
                train_gold = GoldParse(doc, annot_tuples)
                # The tagger is applied before the parser/entity updates and
                # only trained afterwards; keep this order.
                nlp.tagger(doc)
                nlp.parser.train(doc, train_gold)
                if train_gold.ents:
                    nlp.entity.train(doc, train_gold)
                nlp.tagger.train(doc, train_gold.tags)

        summary = (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)
        print('%d:\t%.3f\t%.3f\t%.3f' % summary)
        random.shuffle(gold_tuples)

    for component in (nlp.parser, nlp.entity, nlp.tagger):
        component.model.end_training()
    strings_loc = path.join(model_dir, 'vocab', 'strings.txt')
    nlp.vocab.strings.dump(strings_loc)


def evaluate(Language, dev_loc, model_dir, gold_preproc=False, verbose=True):
    """Score the models stored in model_dir against the file at dev_loc.

    Arguments:
        Language: Pipeline class, instantiated with ``data_dir=model_dir``.
        dev_loc: Location of the evaluation file, handed to
            ``read_docparse_file``.
        model_dir: Directory the pipeline is loaded from.
        gold_preproc: Must be false; scoring on gold pre-processing is not
            supported and is rejected by an assertion.
        verbose: Forwarded to ``Scorer.score``.

    Returns:
        The ``Scorer`` that accumulated the scores of every document.
    """
    assert not gold_preproc
    pipeline = Language(data_dir=model_dir)
    dev_docs = read_docparse_file(dev_loc)
    results = Scorer()
    for raw_text, _segmented, annot_tuples in dev_docs:
        predicted = pipeline(raw_text, merge_mwes=False)
        reference = GoldParse(predicted, annot_tuples)
        results.score(predicted, reference, verbose=verbose)
    return results


def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the documents in dev_loc and write one token per line to out_loc.

    Each output line holds four tab-separated fields: the token's text, its
    tag, the text of its head, and its dependency label. The file is written
    as UTF-8.

    Arguments:
        Language: Pipeline class, instantiated with ``data_dir=model_dir``.
        dev_loc: Location of the input file, handed to ``read_docparse_file``.
        model_dir: Directory the pipeline is loaded from.
        out_loc: Location of the output file; it is overwritten.

    Returns:
        A ``Scorer`` on which nothing has been scored. It is kept only so
        existing callers that use the return value keep working.
    """
    # Load the models from model_dir, as train() and evaluate() do. Calling
    # Language() without data_dir would ignore the model that was just trained.
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_docparse_file(dev_loc)
    scorer = Scorer()
    # The context manager guarantees the file is flushed and closed, also when
    # parsing raises part-way through.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, segmented_text, annot_tuples in gold_tuples:
            tokens = nlp(raw_text)
            for t in tokens:
                out_file.write(
                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                )
    return scorer


@plac.annotations(
    train_loc=("Training file location",),
    dev_loc=("Dev. file location",),
    model_dir=("Location of output model directory",),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool)
)
def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False,
         debug=False):
    """Train English models, optionally dump parses, then print dev scores.

    Arguments:
        train_loc: Training file location.
        dev_loc: Development file location.
        model_dir: Directory the models are written to and loaded from.
        n_sents: Limit on the amount of training data; 0 means no limit.
        out_loc: If non-empty, parses of the dev file are written here.
        verbose: Forwarded to ``evaluate`` for verbose error reporting.
        debug: Selects the 'debug' feature set instead of 'basic'.
    """
    feature_set = 'debug' if debug else 'basic'
    train(English, train_loc, model_dir, feat_set=feature_set,
          gold_preproc=False, n_sents=n_sents)
    if out_loc:
        write_parses(English, dev_loc, model_dir, out_loc)
    scorer = evaluate(English, dev_loc, model_dir, gold_preproc=False,
                      verbose=verbose)

    # Attributes are looked up one at a time, right before printing, so the
    # lines already printed are unaffected if a later score cannot be computed.
    report = (
        ('TOK', 'mistokened'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr_name in report:
        print('%s %s' % (label, getattr(scorer, attr_name)))


# Script entry point: plac parses the command line according to the
# annotations on main() and calls it with the resulting arguments.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 17:53:26 +00:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`

			`import os`
			`from os import path`
			`import shutil`
			`import codecs`
			`import random`

			`import plac`
			`import cProfile`
			`import pstats`

			`import spacy.util`
			`from spacy.en import English`
			`from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir`

			`from spacy.syntax.parser import GreedyParser`
* Fix standard conll file reading. Script needs refactoring. 2015-02-02 12:02:48 +00:00			`from spacy.syntax.parser import OracleError`
* Add parser training script 2015-01-09 17:53:26 +00:00			`from spacy.syntax.util import Config`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`from spacy.syntax.conll import read_docparse_file`
			`from spacy.syntax.conll import GoldParse`
* Add parser training script 2015-01-09 17:53:26 +00:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`from spacy.scorer import Scorer`

* Add parser training script 2015-01-09 17:53:26 +00:00
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,`
* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter 2015-03-24 04:12:37 +00:00			`gold_preproc=False, n_sents=0):`
* Add parser training script 2015-01-09 17:53:26 +00:00			`dep_model_dir = path.join(model_dir, 'deps')`
			`pos_model_dir = path.join(model_dir, 'pos')`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`ner_model_dir = path.join(model_dir, 'ner')`
* Add parser training script 2015-01-09 17:53:26 +00:00			`if path.exists(dep_model_dir):`
			`shutil.rmtree(dep_model_dir)`
			`if path.exists(pos_model_dir):`
			`shutil.rmtree(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`if path.exists(ner_model_dir):`
			`shutil.rmtree(ner_model_dir)`
* Add parser training script 2015-01-09 17:53:26 +00:00			`os.mkdir(dep_model_dir)`
			`os.mkdir(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`os.mkdir(ner_model_dir)`

			`setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)`

			`gold_tuples = read_docparse_file(train_loc)`
* Add parser training script 2015-01-09 17:53:26 +00:00
			`Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`labels=Language.ParserTransitionSystem.get_labels(gold_tuples))`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 11:06:01 +00:00			`Config.write(ner_model_dir, 'config', features='ner', seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`labels=Language.EntityTransitionSystem.get_labels(gold_tuples))`

* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`if n_sents > 0:`
			`gold_tuples = gold_tuples[:n_sents]`
* Respect the model_dir input parameter to train.py 2015-04-08 20:48:26 +00:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00
			`print "Itn.\tUAS\tNER F.\tTag %"`
* Add parser training script 2015-01-09 17:53:26 +00:00			`for itn in range(n_iter):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`scorer = Scorer()`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`for raw_text, segmented_text, annot_tuples in gold_tuples:`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`# Eval before train`
* Use merge_mwe=False in evaluation in train.py 2015-04-07 22:35:19 +00:00			`tokens = nlp(raw_text, merge_mwes=False)`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=False)`

* Allow gold tokenization training, for debugging 2015-03-08 05:17:12 +00:00			`if gold_preproc:`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]`
* Allow gold tokenization training, for debugging 2015-03-08 05:17:12 +00:00			`else:`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`sents = [nlp.tokenizer(raw_text)]`
			`for tokens in sents:`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 11:06:01 +00:00			`gold = GoldParse(tokens, annot_tuples)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`nlp.tagger(tokens)`
* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter 2015-03-24 04:12:37 +00:00			`nlp.parser.train(tokens, gold)`
* Clean up train.py 2015-04-15 04:02:04 +00:00			`if gold.ents:`
			`nlp.entity.train(tokens, gold)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`nlp.tagger.train(tokens, gold.tags)`
* Move scoring away from training. Does not support scoring on gold preproc. 2015-03-23 16:32:55 +00:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`random.shuffle(gold_tuples)`
* Add parser training script 2015-01-09 17:53:26 +00:00			`nlp.parser.model.end_training()`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`nlp.entity.model.end_training()`
* Add parser training script 2015-01-09 17:53:26 +00:00			`nlp.tagger.model.end_training()`
* Ensure StringStore is dumped during training 2015-03-25 00:08:24 +00:00			`nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))`
* Add parser training script 2015-01-09 17:53:26 +00:00

* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`def evaluate(Language, dev_loc, model_dir, gold_preproc=False, verbose=True):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`assert not gold_preproc`
* Respect the model_dir input parameter to train.py 2015-04-08 20:48:26 +00:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`gold_tuples = read_docparse_file(dev_loc)`
			`scorer = Scorer()`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`for raw_text, segmented_text, annot_tuples in gold_tuples:`
* Use merge_mwe=False in evaluation in train.py 2015-04-07 22:35:19 +00:00			`tokens = nlp(raw_text, merge_mwes=False)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`gold = GoldParse(tokens, annot_tuples)`
* Add verbose flag for Scorer, for debugging, and fix ent_strings bug 2015-03-11 06:27:22 +00:00			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 05:17:12 +00:00

* Add write_parses function 2015-03-20 00:14:20 +00:00			`def write_parses(Language, dev_loc, model_dir, out_loc):`
			`nlp = Language()`
			`gold_tuples = read_docparse_file(dev_loc)`
			`scorer = Scorer()`
			`out_file = codecs.open(out_loc, 'w', 'utf8')`
			`for raw_text, segmented_text, annot_tuples in gold_tuples:`
			`tokens = nlp(raw_text)`
			`for t in tokens:`
			`out_file.write(`
			`'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
			`return scorer`


* Tmp commit 2015-02-23 19:05:04 +00:00			`@plac.annotations(`
			`train_loc=("Training file location",),`
			`dev_loc=("Dev. file location",),`
			`model_dir=("Location of output model directory",),`
* Add write_parses function 2015-03-20 00:14:20 +00:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`n_sents=("Number of training sentences", "option", "n", int),`
			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`debug=("Debug mode", "flag", "d", bool)`
* Tmp commit 2015-02-23 19:05:04 +00:00			`)`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False,`
			`debug=False):`
			`train(English, train_loc, model_dir, feat_set='basic' if not debug else 'debug',`
* Remove support for force_gold flag from GreedyParser, since it's not so useful, and it's clutter 2015-03-24 04:12:37 +00:00			`gold_preproc=False, n_sents=n_sents)`
* Add write_parses function 2015-03-20 00:14:20 +00:00			`if out_loc:`
			`write_parses(English, dev_loc, model_dir, out_loc)`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`scorer = evaluate(English, dev_loc, model_dir, gold_preproc=False, verbose=verbose)`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`print 'TOK', scorer.mistokened`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`print 'POS', scorer.tags_acc`
			`print 'UAS', scorer.uas`
			`print 'LAS', scorer.las`

			`print 'NER P', scorer.ents_p`
			`print 'NER R', scorer.ents_r`
			`print 'NER F', scorer.ents_f`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00
* Add parser training script 2015-01-09 17:53:26 +00:00
			`if __name__ == '__main__':`
			`plac.call(main)`