mirror of https://github.com/explosion/spaCy.git
* Work on updating train script for named entity recognition
This commit is contained in:
parent
357dcdcc01
commit
4539c70542
|
@ -21,7 +21,8 @@ from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
|||
from spacy.syntax.parser import GreedyParser
|
||||
from spacy.syntax.parser import OracleError
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.syntax.conll import GoldParse, is_punct_label
|
||||
from spacy.syntax.conll import read_docparse_file
|
||||
from spacy.syntax.conll import GoldParse
|
||||
|
||||
|
||||
def is_punct_label(label):
|
||||
|
@ -183,47 +184,56 @@ def get_labels(sents):
|
|||
return list(sorted(left_labels)), list(sorted(right_labels))
|
||||
|
||||
|
||||
def train(Language, paragraphs, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||
gold_preproc=False, force_gold=False):
|
||||
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||
gold_preproc=False, force_gold=False, n_sents=0):
|
||||
print "Setup model dir"
|
||||
dep_model_dir = path.join(model_dir, 'deps')
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
ner_model_dir = path.join(model_dir, 'ner')
|
||||
if path.exists(dep_model_dir):
|
||||
shutil.rmtree(dep_model_dir)
|
||||
if path.exists(pos_model_dir):
|
||||
shutil.rmtree(pos_model_dir)
|
||||
if path.exists(ner_model_dir):
|
||||
shutil.rmtree(ner_model_dir)
|
||||
os.mkdir(dep_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES,
|
||||
pos_model_dir)
|
||||
os.mkdir(ner_model_dir)
|
||||
|
||||
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
|
||||
|
||||
gold_tuples = read_docparse_file(train_loc)
|
||||
|
||||
labels = Language.ParserTransitionSystem.get_labels(gold_sents)
|
||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=labels)
|
||||
labels=Language.ParserTransitionSystem.get_labels(gold_tuples))
|
||||
Config.write(ner_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
||||
|
||||
nlp = Language()
|
||||
|
||||
for itn in range(n_iter):
|
||||
heads_corr = 0
|
||||
dep_corr = 0
|
||||
pos_corr = 0
|
||||
n_tokens = 0
|
||||
n_all_tokens = 0
|
||||
for gold_sent in gold_sents:
|
||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||
if gold_preproc:
|
||||
#print ' '.join(gold_sent.words)
|
||||
tokens = nlp.tokenizer.tokens_from_list(gold_sent.words)
|
||||
gold_sent.map_heads(nlp.parser.moves.label_ids)
|
||||
sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
|
||||
else:
|
||||
tokens = nlp.tokenizer(gold_sent.raw_text)
|
||||
gold_sent.align_to_tokens(tokens, nlp.parser.moves.label_ids)
|
||||
nlp.tagger(tokens)
|
||||
heads_corr += nlp.parser.train(tokens, gold_sent, force_gold=force_gold)
|
||||
pos_corr += nlp.tagger.train(tokens, gold_sent.tags)
|
||||
n_tokens += gold_sent.n_non_punct
|
||||
n_all_tokens += len(tokens)
|
||||
acc = float(heads_corr) / n_tokens
|
||||
pos_acc = float(pos_corr) / n_all_tokens
|
||||
sents = [nlp.tokenizer(raw_text)]
|
||||
for tokens in sents:
|
||||
|
||||
gold = GoldParse(tokens, annot_tuples, nlp.tags,
|
||||
nlp.parser.moves.label_ids,
|
||||
nlp.entity.moves.label_ids)
|
||||
|
||||
nlp.tagger(tokens)
|
||||
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
|
||||
pos_corr += nlp.tagger.train(tokens, gold.tags_)
|
||||
n_tokens += len(tokens)
|
||||
acc = float(dep_corr) / n_tokens
|
||||
pos_acc = float(pos_corr) / n_tokens
|
||||
print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc
|
||||
random.shuffle(gold_sents)
|
||||
random.shuffle(gold_tuples)
|
||||
nlp.parser.model.end_training()
|
||||
nlp.tagger.model.end_training()
|
||||
return acc
|
||||
|
@ -239,22 +249,22 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
|||
total = 0
|
||||
skipped = 0
|
||||
loss = 0
|
||||
with codecs.open(dev_loc, 'r', 'utf8') as file_:
|
||||
#paragraphs = read_tokenized_gold(file_)
|
||||
paragraphs = read_docparse_gold(file_)
|
||||
for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer,
|
||||
gold_preproc=gold_preproc):
|
||||
assert len(tokens) == len(labels)
|
||||
nlp.tagger(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold_tuples = read_docparse_file(train_loc)
|
||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||
if gold_preproc:
|
||||
tokens = nlp.tokenizer.tokens_from_list(gold_sent.words)
|
||||
nlp.tagger(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold_sent.map_heads(nlp.parser.moves.label_ids)
|
||||
else:
|
||||
tokens = nlp(gold_sent.raw_text)
|
||||
loss += gold_sent.align_to_tokens(tokens, nlp.parser.moves.label_ids)
|
||||
for i, token in enumerate(tokens):
|
||||
pos_corr += token.tag_ == gold_sent.tags[i]
|
||||
n_tokens += 1
|
||||
if gold_sent.heads[i] is None:
|
||||
skipped += 1
|
||||
continue
|
||||
#print i, token.orth_, token.head.i, gold_sent.py_heads[i], gold_sent.labels[i],
|
||||
#print gold_sent.is_correct(i, token.head.i)
|
||||
if gold_sent.labels[i] != 'P':
|
||||
n_corr += gold_sent.is_correct(i, token.head.i)
|
||||
total += 1
|
||||
|
@ -263,12 +273,6 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
|||
return float(n_corr) / (total + loss)
|
||||
|
||||
|
||||
def read_gold(loc, n=0):
|
||||
sent_strs = open(loc).read().strip().split('\n\n')
|
||||
if n == 0:
|
||||
n = len(sent_strs)
|
||||
return [GoldParse.from_docparse(sent) for sent in sent_strs[:n]]
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
train_loc=("Training file location",),
|
||||
|
@ -277,9 +281,9 @@ def read_gold(loc, n=0):
|
|||
n_sents=("Number of training sentences", "option", "n", int)
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, n_sents=0):
|
||||
#train(English, read_gold(train_loc, n=n_sents), model_dir,
|
||||
# gold_preproc=False, force_gold=False)
|
||||
print evaluate(English, read_gold(dev_loc), model_dir, gold_preproc=False)
|
||||
train(English, train_loc, model_dir,
|
||||
gold_preproc=False, force_gold=False, n_sents=n_sents)
|
||||
print evaluate(English, dev_loc, model_dir, gold_preproc=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in New Issue