mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' into feature/dot-underscore
commit bfd58dd0fc
@@ -0,0 +1,50 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...compat import json_dumps, path2str
+from ...util import prints
+from ...gold import iob_to_biluo
+
+
+def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
+    """
+    Convert files in the CoNLL-2003 NER format into JSON format for use
+    with the train CLI.
+    """
+    docs = read_conll_ner(input_path)
+
+    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
+    output_file = output_path / output_filename
+    with output_file.open('w', encoding='utf-8') as f:
+        f.write(json_dumps(docs))
+    prints("Created %d documents" % len(docs),
+           title="Generated output file %s" % path2str(output_file))
+
+
+def read_conll_ner(input_path):
+    text = input_path.open('r', encoding='utf-8').read()
+    delimit_docs = '-DOCSTART- -X- O O'
+    output_docs = []
+    for doc in text.strip().split(delimit_docs):
+        doc = doc.strip()
+        if not doc:
+            continue
+        output_doc = []
+        for sent in doc.split('\n\n'):
+            sent = sent.strip()
+            if not sent:
+                continue
+            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
+            biluo_ents = iob_to_biluo(iob_ents)
+            output_doc.append({'tokens': [
+                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
+                zip(words, tags, biluo_ents)
+            ]})
+        output_docs.append({
+            'id': len(output_docs),
+            'paragraphs': [{'sentences': output_doc}]
+        })
+        output_doc = []
+    return output_docs
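Note: a minimal sketch of driving the converter above, assuming four-column CoNLL-2003 input (word, POS tag, chunk tag, IOB entity tag). The sample text and file name are illustrative, not part of the commit; conll_ner2json is assumed to be importable from the module added above.

from pathlib import Path

# Tiny CoNLL-2003 fragment: one document, one sentence.
sample = """-DOCSTART- -X- O O

EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
. . O O
"""

input_path = Path('eng.train.conll')      # hypothetical input file
with input_path.open('w', encoding='utf-8') as f:
    f.write(sample)
# Writes eng.train.json into the output directory; IOB entity tags are
# converted to BILUO (here I-ORG -> U-ORG, I-MISC -> U-MISC).
conll_ner2json(input_path, Path('.'))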
@@ -88,9 +88,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
     n_train_words = corpus.count_train()
     lang_class = util.get_lang_class(lang)
-    nlp = lang_class(pipeline=pipeline)
+    nlp = lang_class()
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+    for name in pipeline:
+        nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
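The hunk above changes how the train command builds its pipeline: instead of passing component names to the Language constructor, it creates each component by name and adds it explicitly. A minimal standalone sketch of the same pattern, assuming the v2-era create_pipe/add_pipe API; the language and component names here are illustrative:

from spacy import util

lang_class = util.get_lang_class('en')        # look up the Language subclass
nlp = lang_class()                            # bare pipeline, no components yet
for name in ['tagger', 'parser', 'ner']:      # names as passed to the train CLI
    nlp.add_pipe(nlp.create_pipe(name), name=name)
print(nlp.pipe_names)                         # ['tagger', 'parser', 'ner']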
@@ -112,8 +114,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
             util.set_env_log(False)
             epoch_model_path = output_path / ('model%d' % i)
             nlp.to_disk(epoch_model_path)
-            nlp_loaded = lang_class(pipeline=pipeline)
-            nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+            nlp_loaded = util.load_model_from_path(epoch_model_path)
             dev_docs = list(corpus.dev_docs(
                 nlp_loaded,
                 gold_preproc=gold_preproc))
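This hunk and the one below make the same simplification: the model reloaded for evaluation is no longer rebuilt as a bare pipeline and then deserialized, but loaded in one step. A minimal sketch, assuming util.load_model_from_path as in spaCy v2 and a model directory previously written by nlp.to_disk; the path is illustrative:

from pathlib import Path
from spacy import util

epoch_model_path = Path('/tmp/model0')   # hypothetical output of nlp.to_disk
# Reads the directory's meta.json, reconstructs the pipeline it names and
# deserializes the weights -- replacing the old two-step reload.
nlp_loaded = util.load_model_from_path(epoch_model_path)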
@@ -127,8 +128,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
             else:
                 gpu_wps = nwords/(end_time-start_time)
                 with Model.use_device('cpu'):
-                    nlp_loaded = lang_class(pipeline=pipeline)
-                    nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                    nlp_loaded = util.load_model_from_path(epoch_model_path)
                     dev_docs = list(corpus.dev_docs(
                         nlp_loaded, gold_preproc=gold_preproc))
                     start_time = timer()
@@ -126,7 +126,7 @@ def word_shape(text):
 LEX_ATTRS = {
     attrs.LOWER: lambda string: string.lower(),
     attrs.NORM: lambda string: string.lower(),
-    attrs.PREFIX: lambda string: string[:3],
+    attrs.PREFIX: lambda string: string[0],
     attrs.SUFFIX: lambda string: string[-3:],
     attrs.CLUSTER: lambda string: 0,
     attrs.IS_ALPHA: lambda string: string.isalpha(),
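The PREFIX lexeme attribute shrinks from the first three characters to the first character only. A plain-Python sketch of the before/after behaviour (no spaCy required):

def old_prefix(string):
    return string[:3]    # before: 'spacy' -> 'spa'

def new_prefix(string):
    return string[0]     # after:  'spacy' -> 's'

print(old_prefix('spacy'), new_prefix('spacy'))   # spa s
# Edge case: string[:3] returns '' for an empty string, while
# string[0] would raise IndexError.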