From fc7521094195c253ec9ff54c7dcb980241e90305 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 24 May 2015 21:35:02 +0200 Subject: [PATCH] * Move spacy.syntax.conll to spacy.gold --- bin/parser/train.py | 19 +++++++++++-------- setup.py | 2 +- spacy/{syntax/conll.pxd => gold.pxd} | 4 ++-- spacy/{syntax/conll.pyx => gold.pyx} | 2 +- spacy/syntax/arc_eager.pyx | 2 +- spacy/syntax/ner.pyx | 2 +- spacy/syntax/parser.pyx | 9 ++++++++- spacy/syntax/transition_system.pxd | 2 +- 8 files changed, 26 insertions(+), 16 deletions(-) rename spacy/{syntax/conll.pxd => gold.pxd} (87%) rename spacy/{syntax/conll.pyx => gold.pyx} (99%) diff --git a/bin/parser/train.py b/bin/parser/train.py index 28cb34b23..e58f57090 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -20,8 +20,8 @@ from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir from spacy.syntax.parser import GreedyParser from spacy.syntax.parser import OracleError from spacy.syntax.util import Config -from spacy.syntax.conll import read_json_file -from spacy.syntax.conll import GoldParse +from spacy.gold import read_json_file +from spacy.gold import GoldParse from spacy.scorer import Scorer @@ -65,11 +65,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0 gold_tuples = gold_tuples[:n_sents] nlp = Language(data_dir=model_dir) - print "Itn.\tUAS\tNER F.\tTag %\tToken %" + print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %" for itn in range(n_iter): scorer = Scorer() + loss = 0 for raw_text, annot_tuples, ctnt in gold_tuples: - raw_text = ''.join(add_noise(c, corruption_level) for c in raw_text) + if corruption_level != 0: + raw_text = ''.join(add_noise(c, corruption_level) for c in raw_text) tokens = nlp(raw_text, merge_mwes=False) gold = GoldParse(tokens, annot_tuples) scorer.score(tokens, gold, verbose=False) @@ -79,7 +81,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0 gold = GoldParse(tokens, annot_tuples) nlp.tagger(tokens) try: - nlp.parser.train(tokens, gold) + loss += nlp.parser.train(tokens, gold) except AssertionError: # TODO: Do something about non-projective sentences continue @@ -87,7 +89,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0 nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) - print '%d:\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, + print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc, scorer.token_acc) random.shuffle(gold_tuples) @@ -148,15 +150,16 @@ def get_sents(json_loc): model_dir=("Location of output model directory",), out_loc=("Out location", "option", "o", str), n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), debug=("Debug mode", "flag", "d", bool) ) -def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False, +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, debug=False, corruption_level=0.0): train(English, read_json_file(train_loc), model_dir, feat_set='basic' if not debug else 'debug', gold_preproc=False, n_sents=n_sents, - corruption_level=corruption_level) + corruption_level=corruption_level, n_iter=n_iter) if out_loc: write_parses(English, dev_loc, model_dir, out_loc) scorer = evaluate(English, read_json_file(dev_loc), diff --git a/setup.py b/setup.py index 837d8923f..ee67cd378 100644 --- a/setup.py +++ b/setup.py @@ -152,7 +152,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', - 'spacy.syntax.conll', 'spacy.orth', + 'spacy.gold', 'spacy.orth', 'spacy.syntax.ner'] diff --git a/spacy/syntax/conll.pxd b/spacy/gold.pxd similarity index 87% rename from spacy/syntax/conll.pxd rename to spacy/gold.pxd index 6fc27b151..037a2a4ee 100644 --- a/spacy/syntax/conll.pxd +++ b/spacy/gold.pxd @@ -1,7 +1,7 @@ from cymem.cymem cimport Pool -from ..structs cimport TokenC -from .transition_system cimport Transition +from .structs cimport TokenC +from .syntax.transition_system cimport Transition cimport numpy diff --git a/spacy/syntax/conll.pyx b/spacy/gold.pyx similarity index 99% rename from spacy/syntax/conll.pyx rename to spacy/gold.pyx index f0a4e20c2..df34afa74 100644 --- a/spacy/syntax/conll.pyx +++ b/spacy/gold.pyx @@ -2,7 +2,7 @@ import numpy import codecs import json import random -from spacy.munge.alignment import align +from .munge.alignment import align from libc.string cimport memset diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index cb0918606..8de4b8a74 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -10,7 +10,7 @@ from ._state cimport count_left_kids from ..structs cimport TokenC from .transition_system cimport do_func_t, get_cost_func_t -from .conll cimport GoldParse +from ..gold cimport GoldParse DEF NON_MONOTONIC = True diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4a4da15d2..2189f407e 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -8,7 +8,7 @@ from .transition_system cimport do_func_t from ..structs cimport TokenC, Entity from thinc.typedefs cimport weight_t -from .conll cimport GoldParse +from ..gold cimport GoldParse cdef enum: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 36acce3de..5502f224b 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -30,7 +30,7 @@ from .arc_eager cimport TransitionSystem, Transition from .transition_system import OracleError from ._state cimport new_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1 -from .conll cimport GoldParse +from ..gold cimport GoldParse from . import _parse_features from ._parse_features cimport fill_context, CONTEXT_SIZE @@ -107,14 +107,21 @@ cdef class GreedyParser: cdef Transition guess cdef Transition best cdef atom_t[CONTEXT_SIZE] context + loss = 0 while not is_final(state): + fill_context(context, state) scores = self.model.score(context) guess = self.moves.best_valid(scores, state) best = self.moves.best_gold(scores, state, gold) + #print self.moves.move_name(guess.move, guess.label), + #print self.moves.move_name(best.move, best.label), + #print print_state(state, py_words) cost = guess.get_cost(&guess, state, gold) self.model.update(context, guess.clas, best.clas, cost) guess.do(&guess, state) + loss += cost self.moves.finalize_state(state) + return loss diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 44fe43949..3ac1b62f6 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -3,7 +3,7 @@ from thinc.typedefs cimport weight_t from ..structs cimport TokenC from ._state cimport State -from .conll cimport GoldParse +from ..gold cimport GoldParse from ..strings cimport StringStore