# Method of ``class CLI`` in spacy/__main__.py. The same change set also adds
# ``from spacy.cli import convert as cli_convert`` to the module imports and
# appends 'convert' to ``CLI.commands``.
@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    # int rather than float: this is a sentence count and the default is the
    # int 10, so "-n 10" should arrive as 10, not 10.0.
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.

    input_file: path of the file to convert (positional).
    output_dir: output directory for the converted file (positional).
    n_sents: number of sentences per doc (option ``-n``, default 10).
    morphology: enable appending morphology to tags (flag ``-m``).

    All four values are forwarded positionally, in this order, to
    ``cli_convert``.
    """
    cli_convert(input_file, output_dir, n_sents, morphology)
# coding: utf8
"""Dispatch file conversion to a converter chosen by file extension.

``spacy/cli/__init__.py`` re-exports the entry point of this module with
``from .convert import convert``.
"""
from __future__ import unicode_literals, division, print_function

import io
from pathlib import Path, PurePosixPath

from .converters import conllu2json
from .. import util


# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.

CONVERTERS = {
    '.conllu': conllu2json
}


def convert(input_file, output_dir, *args):
    """Convert ``input_file`` with the converter registered for its suffix.

    input_file: path of the file to convert; its suffix is looked up in
        ``CONVERTERS``.
    output_dir: path of the output directory, validated by ``check_dirs``
        and handed to the converter.
    *args: forwarded unchanged to the converter after the two paths.

    A failed path check or an unregistered suffix is reported through
    ``util.sys_exit``.
    """
    input_path = Path(input_file)
    output_path = Path(output_dir)
    check_dirs(input_path, output_path)

    converter = CONVERTERS.get(input_path.suffix)
    if converter is None:
        # ``.name`` instead of ``.parts[-1]``: a path such as '.' has empty
        # ``parts``, so indexing would raise IndexError instead of reporting
        # the unknown format.
        util.sys_exit("Can't find converter for {}".format(input_path.name),
                      title="Unknown format")
    else:
        converter(input_path, output_path, *args)


def check_dirs(input_file, output_path):
    """Check that the input path exists and the output path is a directory.

    input_file: ``Path`` of the file to convert.
    output_path: ``Path`` of the directory that will receive the output.

    Each failed check is reported through ``util.sys_exit`` with the
    offending path in POSIX form.
    """
    if not input_file.exists():
        util.sys_exit(input_file.as_posix(), title="Input file not found")
    # is_dir() rather than exists(): an existing regular file is not a usable
    # output directory and should be rejected here, not later in a converter.
    if not output_path.is_dir():
        util.sys_exit(output_path.as_posix(), title="Output directory not found")