From a6352403982da4211cb83a80ae8bdee2fc861a7b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 9 Oct 2017 22:03:26 -0500
Subject: [PATCH 1/4] Add conll_ner2json converter

---
 spacy/cli/converters/conll_ner2json.py | 50 ++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 spacy/cli/converters/conll_ner2json.py

diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
new file mode 100644
index 000000000..e3bd82e7e
--- /dev/null
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -0,0 +1,50 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...compat import json_dumps, path2str
+from ...util import prints
+from ...gold import iob_to_biluo
+
+
+def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
+    """
+    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
+    """
+    docs = read_conll_ner(input_path)
+
+    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
+    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
+    output_file = output_path / output_filename
+    with output_file.open('w', encoding='utf-8') as f:
+        f.write(json_dumps(docs))
+    prints("Created %d documents" % len(docs),
+           title="Generated output file %s" % path2str(output_file))
+
+
+def read_conll_ner(input_path):
+    text = input_path.open('r', encoding='utf-8').read()
+    i = 0
+    delimit_docs = '-DOCSTART- -X- O O'
+    output_docs = []
+    for doc in text.strip().split(delimit_docs):
+        doc = doc.strip()
+        if not doc:
+            continue
+        output_doc = []
+        for sent in doc.split('\n\n'):
+            sent = sent.strip()
+            if not sent:
+                continue
+            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
+            biluo_ents = iob_to_biluo(iob_ents)
+            output_doc.append({'tokens': [
+                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
+                zip(words, tags, biluo_ents)
+            ]})
+        output_docs.append({
+            'id': len(output_docs),
+            'paragraphs': [{'sentences': output_doc}]
+        })
+        output_doc = []
+    return output_docs
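
Note on the converter above: CoNLL-2003 NER files put one token per line with
four whitespace-separated columns (word, POS tag, chunk tag, IOB entity tag),
separate sentences with blank lines, and mark document boundaries with a
'-DOCSTART- -X- O O' line; read_conll_ner splits on exactly those delimiters.
The entity tags are then re-encoded from IOB to BILUO via iob_to_biluo, which
the patch imports from spacy.gold. As a rough, self-contained sketch of what
that re-encoding does (an illustrative reimplementation, not spaCy's own code,
and it assumes well-formed IOB input):

    # Illustrative sketch only -- the converter uses spacy.gold.iob_to_biluo.
    def iob_to_biluo_demo(tags):
        biluo = []
        for i, tag in enumerate(tags):
            if tag == 'O':
                biluo.append('O')
                continue
            label = tag.split('-', 1)[1]
            prev = tags[i - 1] if i > 0 else 'O'
            nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
            # An entity starts here unless the previous tag continues it,
            # and ends here unless the next tag is I- with the same label.
            starts = tag.startswith('B-') or prev not in ('B-' + label, 'I-' + label)
            ends = nxt != 'I-' + label
            if starts and ends:
                biluo.append('U-' + label)
            elif starts:
                biluo.append('B-' + label)
            elif ends:
                biluo.append('L-' + label)
            else:
                biluo.append('I-' + label)
        return biluo

    print(iob_to_biluo_demo(['B-ORG', 'I-ORG', 'O', 'B-PER']))
    # -> ['B-ORG', 'L-ORG', 'O', 'U-PER']

BILUO makes every entity boundary explicit per token (B = first token of a
multi-token entity, I = inside, L = last, U = single-token entity, O =
outside), which is the encoding spaCy's NER component trains on.
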
From 97c9b5db8b6219d53967a136fa9fdd63bd06fca5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 9 Oct 2017 23:41:16 -0500
Subject: [PATCH 2/4] Patch spacy.train for new pipeline management

---
 spacy/cli/train.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b605f4e61..35ce4c43b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -88,9 +88,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
     n_train_words = corpus.count_train()
 
     lang_class = util.get_lang_class(lang)
-    nlp = lang_class(pipeline=pipeline)
+    nlp = lang_class()
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+    for name in pipeline:
+        nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
 
@@ -113,6 +115,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 util.set_env_log(False)
                 epoch_model_path = output_path / ('model%d' % i)
                 nlp.to_disk(epoch_model_path)
                 nlp_loaded = lang_class(pipeline=pipeline)
+                for name in pipeline:
+                    nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                 dev_docs = list(corpus.dev_docs(
                     nlp_loaded,
@@ -128,6 +132,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                     gpu_wps = nwords/(end_time-start_time)
                     with Model.use_device('cpu'):
                         nlp_loaded = lang_class(pipeline=pipeline)
+                        for name in pipeline:
+                            nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
+
                         nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                         dev_docs = list(corpus.dev_docs(
                             nlp_loaded, gold_preproc=gold_preproc))

From 8143618497399543cbceb8c895cc071961094d43 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 10 Oct 2017 19:32:54 +0200
Subject: [PATCH 3/4] Set prefix length back to 1

---
 spacy/lang/lex_attrs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 63695d8a1..d4beebd26 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -126,7 +126,7 @@ def word_shape(text):
 LEX_ATTRS = {
     attrs.LOWER: lambda string: string.lower(),
     attrs.NORM: lambda string: string.lower(),
-    attrs.PREFIX: lambda string: string[:3],
+    attrs.PREFIX: lambda string: string[0],
     attrs.SUFFIX: lambda string: string[-3:],
     attrs.CLUSTER: lambda string: 0,
     attrs.IS_ALPHA: lambda string: string.isalpha(),

From 5156074df17ee361e1d1444d48118886012b9911 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 10 Oct 2017 12:51:20 -0500
Subject: [PATCH 4/4] Make loading code more consistent in train command

---
 spacy/cli/train.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 35ce4c43b..05d035769 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -114,10 +114,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 util.set_env_log(False)
                 epoch_model_path = output_path / ('model%d' % i)
                 nlp.to_disk(epoch_model_path)
-                nlp_loaded = lang_class(pipeline=pipeline)
-                for name in pipeline:
-                    nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
-                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                nlp_loaded = util.load_model_from_path(epoch_model_path)
                 dev_docs = list(corpus.dev_docs(
                     nlp_loaded,
                     gold_preproc=gold_preproc))
@@ -131,11 +128,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 else:
                     gpu_wps = nwords/(end_time-start_time)
                     with Model.use_device('cpu'):
-                        nlp_loaded = lang_class(pipeline=pipeline)
-                        for name in pipeline:
-                            nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
-
-                        nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                         dev_docs = list(corpus.dev_docs(
                             nlp_loaded, gold_preproc=gold_preproc))
                         start_time = timer()
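
Taken together, patches 2 and 4 move the train command off the old
lang_class(pipeline=...) constructor and onto explicit component management,
then consolidate reloading behind util.load_model_from_path, which rebuilds
the pipeline from the component names recorded in the saved model's meta.json.
A minimal sketch of the resulting pattern (spaCy 2.0-era API; the 'en'
language code and the component names here are just examples):

    from spacy import util

    # Build a blank Language and attach built-in components by factory name,
    # mirroring the loop these patches add to spacy/cli/train.py.
    lang_class = util.get_lang_class('en')
    nlp = lang_class()
    for name in ('tagger', 'parser', 'ner'):
        nlp.add_pipe(nlp.create_pipe(name), name=name)
    print(nlp.pipe_names)  # ['tagger', 'parser', 'ner']

    # After nlp.to_disk(some_path), a single helper restores the same
    # pipeline instead of repeating the loop at every call site:
    # nlp_loaded = util.load_model_from_path(some_path)

Centralizing the reload in one helper is what lets patch 4 delete the three
copies of the rebuild loop that patch 2 had introduced in train.py.
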