diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 847051e3f..e95ffd08b 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 +7,6 @@ from pathlib import Path from .converters import conllu2json, iob2json from ..util import prints - # Converters are matched by file extension. To add a converter, add a new entry # to this dict with the file extension mapped to the converter function imported # from /converters. @@ -39,4 +38,5 @@ def convert(_, input_file, output_dir, n_sents, morphology): if not file_ext in CONVERTERS: prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, n_sents, morphology) + CONVERTERS[file_ext](input_path, output_path, + n_sents=n_sents, morphology=morphology) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 45393dd80..c2e944c0a 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ...compat import json_dumps, path2str from ...util import prints +from ...gold import iob_to_biluo def iob2json(input_path, output_path, n_sents=10, *a, **k): @@ -29,9 +30,10 @@ def read_iob(file_): continue tokens = [t.rsplit('|', 2) for t in line.split()] words, pos, iob = zip(*tokens) + biluo = iob_to_biluo(iob) sentences.append([ {'orth': w, 'tag': p, 'ner': ent} - for (w, p, ent) in zip(words, pos, iob) + for (w, p, ent) in zip(words, pos, biluo) ]) sentences = [{'tokens': sent} for sent in sentences] paragraphs = [{'sentences': [sent]} for sent in sentences] diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 558e4e008..faf135b00 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -306,7 +306,7 @@ def read_json_file(loc, docs_filter=None, limit=None): yield [paragraph.get('raw', None), sents] -def _iob_to_biluo(tags): +def iob_to_biluo(tags): out = [] curr_label = None tags = list(tags)