spaCy/bin/parser/train.py

#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals

import os
from os import path
import shutil
import codecs
import random

import plac
import cProfile
import pstats
import re

import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.parser import GreedyParser
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer


def add_noise(c, noise_level):
    """Return ``c``, possibly corrupted, for noisy-text training.

    A single draw from ``random.random()`` decides whether the character is
    touched at all: if the draw is at or above ``noise_level`` the input is
    returned unchanged.  Otherwise a space becomes a newline, a newline
    becomes a space, the punctuation marks ``. ' ! ?`` are dropped (empty
    string), and anything else is lower-cased.

    Args:
        c: The character to (maybe) corrupt.
        noise_level: Threshold compared against the random draw; larger
            values corrupt more often.

    Returns:
        The original or corrupted character (possibly the empty string).
    """
    leave_untouched = random.random() >= noise_level
    if leave_untouched:
        return c
    # Swap the two whitespace characters so tokenisation sees altered layout.
    if c == ' ':
        return '\n'
    if c == '\n':
        return ' '
    # Sentence-final punctuation and apostrophes are deleted outright.
    if c in ('.', "'", '!', '?'):
        return ''
    return c.lower()


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, n_sents=0, corruption_level=0):
    """Train the tagger, dependency parser and entity recogniser.

    The ``deps``, ``pos`` and ``ner`` sub-directories of ``model_dir`` are
    deleted and recreated (with ``os.mkdir``, so ``model_dir`` itself must
    already exist), the POS model directory is initialised, and config files
    are written for the parser and the entity recogniser.  Labels for both
    configs are collected from the full ``gold_tuples``, before any
    ``n_sents`` truncation.

    Each iteration walks every example: the (optionally corrupted) raw text
    is run through the whole pipeline and scored against the gold standard,
    and only then is the text re-tokenised and used to update the models, so
    the printed scores reflect the model state before each update.  One
    tab-separated progress line is printed per iteration, after which
    ``gold_tuples`` is shuffled in place.

    When training finishes, ``end_training()`` is called on the three models
    and the vocabulary strings are dumped to ``<model_dir>/vocab/strings.txt``.

    Args:
        Language: Pipeline class; must expose ``ParserTransitionSystem`` and
            ``EntityTransitionSystem`` and accept ``data_dir=`` on
            construction.
        gold_tuples: Sequence of 3-item examples whose first two items are
            the raw text and the annotation tuples.  Must support slicing
            and ``random.shuffle``.
        model_dir: Directory holding the model sub-directories.
        n_iter: Number of passes over the training data.
        feat_set: Feature-set name written to the parser config.
        seed: Value written to both config files; this function does not
            seed ``random`` itself.
        gold_preproc: Must be false; gold pre-processing is not supported.
        n_sents: If greater than zero, train on the first ``n_sents``
            examples only.
        corruption_level: Passed to ``add_noise`` for every character of the
            raw text; ``0`` disables corruption.

    Raises:
        AssertionError: If ``gold_preproc`` is true.
    """
    # Checked up front, as evaluate() does: the original only tripped this
    # inside the training loop, i.e. after the existing model directories had
    # already been deleted and the configs rewritten.
    assert not gold_preproc, 'gold_preproc training is not supported'

    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    model_sub_dirs = (dep_model_dir, pos_model_dir, ner_model_dir)
    # Start from empty directories: drop anything left by a previous run.
    for model_sub_dir in model_sub_dirs:
        if path.exists(model_sub_dir):
            shutil.rmtree(model_sub_dir)
    for model_sub_dir in model_sub_dirs:
        os.mkdir(model_sub_dir)

    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples))

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
    nlp = Language(data_dir=model_dir)

    # Single-argument parenthesised print: same output under the Python 2
    # print statement, and also valid as a Python 3 call.
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, annot_tuples, ctnt in gold_tuples:
            if corruption_level != 0:
                raw_text = ''.join(add_noise(c, corruption_level) for c in raw_text)
            # Score with the full pipeline before this example updates it.
            tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)

            # Train on a freshly tokenised, not yet annotated copy.
            tokens = nlp.tokenizer(raw_text)
            gold = GoldParse(tokens, annot_tuples)
            nlp.tagger(tokens)
            try:
                loss += nlp.parser.train(tokens, gold)
            except AssertionError:
                # TODO: Do something about non-projective sentences
                # The example is skipped entirely: no entity or tagger
                # update is made for it either.
                continue
            if gold.ents:
                nlp.entity.train(tokens, gold)
            nlp.tagger.train(tokens, gold.tags)

        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
        random.shuffle(gold_tuples)
    nlp.parser.model.end_training()
    nlp.entity.model.end_training()
    nlp.tagger.model.end_training()
    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=True):
    """Score the pipeline stored in ``model_dir`` against gold examples.

    Args:
        Language: Pipeline class, constructed with ``data_dir=model_dir``.
        gold_tuples: Iterable of 3-item examples; the first two items are
            the raw text and the annotation tuples, the third is ignored.
        model_dir: Directory the pipeline is loaded from.
        gold_preproc: Must be false; asserted before anything is loaded.
        verbose: Forwarded to ``Scorer.score``.

    Returns:
        The ``Scorer`` that accumulated results over all examples.
    """
    assert not gold_preproc
    pipeline = Language(data_dir=model_dir)
    results = Scorer()
    for example in gold_tuples:
        raw_text, annot_tuples, _brackets = example
        doc = pipeline(raw_text, merge_mwes=False)
        results.score(doc, GoldParse(doc, annot_tuples), verbose=verbose)
    return results


def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the development data and write one token per line to ``out_loc``.

    Each output line holds four tab-separated fields: the token's text, its
    tag, the text of its head, and its dependency label.  The file is
    written as UTF-8.

    Args:
        Language: Pipeline class, constructed with ``data_dir=model_dir`` in
            the same way as ``train`` and ``evaluate`` do.
        dev_loc: Location of the development JSON file, read with
            ``read_json_file``.
        model_dir: Directory the pipeline is loaded from.
        out_loc: Path of the output file; it is overwritten.

    Returns:
        The ``Scorer`` created here.  Nothing is scored against it; it is
        returned only to keep the original interface.
    """
    # Load the model that was just trained rather than the default data.
    nlp = Language(data_dir=model_dir)
    # read_json_file is the reader this file imports and that main() uses
    # on the same dev_loc; the previous reader name was never imported.
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The context manager guarantees the file is flushed and closed even if
    # parsing fails part-way through.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for example in gold_tuples:
            # Only the raw text is needed; it is the first item of each
            # example, as unpacked in train() and evaluate().
            raw_text = example[0]
            tokens = nlp(raw_text)
            for t in tokens:
                out_file.write(
                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                )
    return scorer


def get_sents(json_loc, section='train'):
    """Yield every example read from a JSON file or a directory of them.

    ``json_loc`` is resolved in this order:

    1. If it is an existing file, that file alone is read.
    2. If it is a directory containing ``<section>.json``, that file is read.
    3. Otherwise the numbered files ``NN.json`` inside it are read:
       ``02`` to ``21`` for the ``'train'`` section and ``22`` for ``'dev'``.

    Args:
        json_loc: Path to a JSON file, or to a directory of JSON files.
        section: ``'train'`` or ``'dev'``; only consulted when ``json_loc``
            is not itself a file.

    Yields:
        Whatever ``read_json_file`` produces for each file, in file order.

    Raises:
        ValueError: If the numbered-file fallback is needed and ``section``
            is neither ``'train'`` nor ``'dev'``.  Because this is a
            generator, the error surfaces on the first iteration.
    """
    section_loc = path.join(json_loc, section + '.json')
    if path.isfile(json_loc):
        locs = [json_loc]
    elif path.exists(section_loc):
        locs = [section_loc]
    else:
        if section == 'train':
            file_range = range(2, 22)
        elif section == 'dev':
            file_range = range(22, 23)
        else:
            raise ValueError(
                "Unknown section %r: expected 'train' or 'dev'" % (section,))
        # File names are zero-padded to two digits, e.g. 02.json.
        locs = [path.join(json_loc, '%02d.json' % i) for i in file_range]

    for loc in locs:
        for sent in read_json_file(loc):
            yield sent


# Command-line entry point: train on train_loc, optionally write parses of the
# dev data to out_loc, then evaluate on dev_loc and print the scores.
# NOTE(review): documented with comments rather than a docstring because plac
# may surface a docstring in the generated --help text -- confirm before
# converting.
@plac.annotations(
    train_loc=("Location of training json file"),
    dev_loc=("Location of development json file"),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    model_dir=("Location of output model directory",),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool)
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0):
    # The debug flag only selects the parser feature set.
    feature_set = 'debug' if debug else 'basic'
    train(English, read_json_file(train_loc), model_dir,
          feat_set=feature_set, gold_preproc=False, n_sents=n_sents,
          corruption_level=corruption_level, n_iter=n_iter)

    if out_loc:
        write_parses(English, dev_loc, model_dir, out_loc)

    dev_examples = read_json_file(dev_loc)
    scorer = evaluate(English, dev_examples, model_dir,
                      gold_preproc=False, verbose=verbose)

    # Tokenisation is reported as 100 minus the accuracy; every other figure
    # is printed as the scorer exposes it.  Attributes are read one at a time,
    # immediately before printing, so the output order matches the reads.
    print('%s %s' % ('TOK', 100 - scorer.token_acc))
    reported = (
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr_name in reported:
        print('%s %s' % (label, getattr(scorer, attr_name)))


# Run as a script: hand main() to plac, which parses the command line
# according to the annotations declared on it and then calls it.
if __name__ == '__main__':
    plac.call(main)
* Add parser training script 2015-01-09 17:53:26 +00:00			`#!/usr/bin/env python`
			`from __future__ import division`
			`from __future__ import unicode_literals`

			`import os`
			`from os import path`
			`import shutil`
			`import codecs`
			`import random`

			`import plac`
			`import cProfile`
			`import pstats`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`import re`
* Add parser training script 2015-01-09 17:53:26 +00:00
			`import spacy.util`
			`from spacy.en import English`
			`from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir`

			`from spacy.syntax.parser import GreedyParser`
* Fix standard conll file reading. Script needs refactoring. 2015-02-02 12:02:48 +00:00			`from spacy.syntax.parser import OracleError`
* Add parser training script 2015-01-09 17:53:26 +00:00			`from spacy.syntax.util import Config`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`from spacy.gold import read_json_file`
			`from spacy.gold import GoldParse`
* Add parser training script 2015-01-09 17:53:26 +00:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`from spacy.scorer import Scorer`

* Add parser training script 2015-01-09 17:53:26 +00:00
* Tmp commit 2015-05-24 00:50:14 +00:00			`def add_noise(c, noise_level):`
			`if random.random() >= noise_level:`
			`return c`
			`elif c == ' ':`
			`return '\n'`
			`elif c == '\n':`
			`return ' '`
			`elif c in ['.', "'", "!", "?"]:`
			`return ''`
			`else:`
			`return c.lower()`


* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 14:38:54 +00:00			`def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,`
* Tmp commit 2015-05-24 00:50:14 +00:00			`gold_preproc=False, n_sents=0, corruption_level=0):`
* Add parser training script 2015-01-09 17:53:26 +00:00			`dep_model_dir = path.join(model_dir, 'deps')`
			`pos_model_dir = path.join(model_dir, 'pos')`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`ner_model_dir = path.join(model_dir, 'ner')`
* Add parser training script 2015-01-09 17:53:26 +00:00			`if path.exists(dep_model_dir):`
			`shutil.rmtree(dep_model_dir)`
			`if path.exists(pos_model_dir):`
			`shutil.rmtree(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`if path.exists(ner_model_dir):`
			`shutil.rmtree(ner_model_dir)`
* Add parser training script 2015-01-09 17:53:26 +00:00			`os.mkdir(dep_model_dir)`
			`os.mkdir(pos_model_dir)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`os.mkdir(ner_model_dir)`

			`setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)`

* Add parser training script 2015-01-09 17:53:26 +00:00			`Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`labels=Language.ParserTransitionSystem.get_labels(gold_tuples))`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 11:06:01 +00:00			`Config.write(ner_model_dir, 'config', features='ner', seed=seed,`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`labels=Language.EntityTransitionSystem.get_labels(gold_tuples))`

* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`if n_sents > 0:`
			`gold_tuples = gold_tuples[:n_sents]`
* Respect the model_dir input parameter to train.py 2015-04-08 20:48:26 +00:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"`
* Add parser training script 2015-01-09 17:53:26 +00:00			`for itn in range(n_iter):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`scorer = Scorer()`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`loss = 0`
* Tmp commit 2015-05-24 00:50:14 +00:00			`for raw_text, annot_tuples, ctnt in gold_tuples:`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`if corruption_level != 0:`
			`raw_text = ''.join(add_noise(c, corruption_level) for c in raw_text)`
* Use merge_mwe=False in evaluation in train.py 2015-04-07 22:35:19 +00:00			`tokens = nlp(raw_text, merge_mwes=False)`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`gold = GoldParse(tokens, annot_tuples)`
			`scorer.score(tokens, gold, verbose=False)`
* Tmp commit 2015-05-24 00:50:14 +00:00			`assert not gold_preproc`
			`sents = [nlp.tokenizer(raw_text)]`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`for tokens in sents:`
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 11:06:01 +00:00			`gold = GoldParse(tokens, annot_tuples)`
* Work on updating train script for named entity recognition 2015-03-09 05:46:53 +00:00			`nlp.tagger(tokens)`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`try:`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`loss += nlp.parser.train(tokens, gold)`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`except AssertionError:`
			`# TODO: Do something about non-projective sentences`
			`continue`
* Clean up train.py 2015-04-15 04:02:04 +00:00			`if gold.ents:`
			`nlp.entity.train(tokens, gold)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`nlp.tagger.train(tokens, gold.tags)`
* Move scoring away from training. Does not support scoring on gold preproc. 2015-03-23 16:32:55 +00:00
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`scorer.tags_acc,`
			`scorer.token_acc)`
* Restore shuffling, and remove print statements from train.py 2015-05-07 20:52:27 +00:00			`random.shuffle(gold_tuples)`
* Add parser training script 2015-01-09 17:53:26 +00:00			`nlp.parser.model.end_training()`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`nlp.entity.model.end_training()`
* Add parser training script 2015-01-09 17:53:26 +00:00			`nlp.tagger.model.end_training()`
* Ensure StringStore is dumped during training 2015-03-25 00:08:24 +00:00			`nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))`
* Add parser training script 2015-01-09 17:53:26 +00:00

* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 14:38:54 +00:00			`def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=True):`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`assert not gold_preproc`
* Respect the model_dir input parameter to train.py 2015-04-08 20:48:26 +00:00			`nlp = Language(data_dir=model_dir)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`scorer = Scorer()`
* Tmp commit 2015-05-24 00:50:14 +00:00			`for raw_text, annot_tuples, brackets in gold_tuples:`
* Use merge_mwe=False in evaluation in train.py 2015-04-07 22:35:19 +00:00			`tokens = nlp(raw_text, merge_mwes=False)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`gold = GoldParse(tokens, annot_tuples)`
* Add verbose flag for Scorer, for debugging, and fix ent_strings bug 2015-03-11 06:27:22 +00:00			`scorer.score(tokens, gold, verbose=verbose)`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`return scorer`
* Allow gold tokenization training, for debugging 2015-03-08 05:17:12 +00:00

* Add write_parses function 2015-03-20 00:14:20 +00:00			`def write_parses(Language, dev_loc, model_dir, out_loc):`
			`nlp = Language()`
			`gold_tuples = read_docparse_file(dev_loc)`
			`scorer = Scorer()`
			`out_file = codecs.open(out_loc, 'w', 'utf8')`
			`for raw_text, segmented_text, annot_tuples in gold_tuples:`
			`tokens = nlp(raw_text)`
			`for t in tokens:`
			`out_file.write(`
			`'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)`
			`)`
			`return scorer`


* Tmp commit 2015-05-24 00:50:14 +00:00			`def get_sents(json_loc):`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`if path.exists(path.join(json_dir, section + '.json')):`
			`for sent in read_json_file(path.join(json_dir, section + '.json')):`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 14:38:54 +00:00			`yield sent`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`else:`
			`if section == 'train':`
			`file_range = range(2, 22)`
			`elif section == 'dev':`
			`file_range = range(22, 23)`

			`for i in file_range:`
			`sec = str(i)`
			`if len(sec) == 1:`
			`sec = '0' + sec`
			`loc = path.join(json_dir, sec + '.json')`
			`for sent in read_json_file(loc):`
			`yield sent`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 14:38:54 +00:00

* Tmp commit 2015-02-23 19:05:04 +00:00			`@plac.annotations(`
* Tmp commit 2015-05-24 00:50:14 +00:00			`train_loc=("Location of training json file"),`
			`dev_loc=("Location of development json file"),`
			`corruption_level=("Amount of noise to add to training data", "option", "c", float),`
* Tmp commit 2015-02-23 19:05:04 +00:00			`model_dir=("Location of output model directory",),`
* Add write_parses function 2015-03-20 00:14:20 +00:00			`out_loc=("Out location", "option", "o", str),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`n_sents=("Number of training sentences", "option", "n", int),`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`n_iter=("Number of training iterations", "option", "i", int),`
* Move to fixing up ent_strings and dep_strings passing 2015-03-14 15:09:55 +00:00			`verbose=("Verbose error reporting", "flag", "v", bool),`
* Ensure better separation between score printing and training in train.py 2015-03-24 03:25:38 +00:00			`debug=("Debug mode", "flag", "d", bool)`
* Tmp commit 2015-02-23 19:05:04 +00:00			`)`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,`
* Tmp commit 2015-05-24 00:50:14 +00:00			`debug=False, corruption_level=0.0):`
			`train(English, read_json_file(train_loc), model_dir,`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 14:38:54 +00:00			`feat_set='basic' if not debug else 'debug',`
* Tmp commit 2015-05-24 00:50:14 +00:00			`gold_preproc=False, n_sents=n_sents,`
* Move spacy.syntax.conll to spacy.gold 2015-05-24 19:35:02 +00:00			`corruption_level=corruption_level, n_iter=n_iter)`
* Add write_parses function 2015-03-20 00:14:20 +00:00			`if out_loc:`
			`write_parses(English, dev_loc, model_dir, out_loc)`
* Tmp commit 2015-05-24 00:50:14 +00:00			`scorer = evaluate(English, read_json_file(dev_loc),`
* Use JSON docs for training and evaluation. Currently a bug that is costing 0.6 acc 2015-05-06 14:38:54 +00:00			`model_dir, gold_preproc=False, verbose=verbose)`
* Tmp commit of train, while I move to better alignment in gold standard 2015-05-23 15:21:25 +00:00			`print 'TOK', 100-scorer.token_acc`
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`print 'POS', scorer.tags_acc`
			`print 'UAS', scorer.uas`
			`print 'LAS', scorer.las`

			`print 'NER P', scorer.ents_p`
			`print 'NER R', scorer.ents_r`
			`print 'NER F', scorer.ents_f`
Remove trailing whitespace 2015-04-19 08:31:31 +00:00
* Add parser training script 2015-01-09 17:53:26 +00:00
			`if __name__ == '__main__':`
			`plac.call(main)`