From 5a87bcf35f78a88173280918ab5908278ae8a7a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:32:34 -0500 Subject: [PATCH 1/3] Fix converters --- spacy/cli/converters/iob2json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 45393dd80..c2e944c0a 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ...compat import json_dumps, path2str from ...util import prints +from ...gold import iob_to_biluo def iob2json(input_path, output_path, n_sents=10, *a, **k): @@ -29,9 +30,10 @@ def read_iob(file_): continue tokens = [t.rsplit('|', 2) for t in line.split()] words, pos, iob = zip(*tokens) + biluo = iob_to_biluo(iob) sentences.append([ {'orth': w, 'tag': p, 'ner': ent} - for (w, p, ent) in zip(words, pos, iob) + for (w, p, ent) in zip(words, pos, biluo) ]) sentences = [{'tokens': sent} for sent in sentences] paragraphs = [{'sentences': [sent]} for sent in sentences] From 2b3b937a04622d13e30204ff4553d6815a841289 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:32:41 -0500 Subject: [PATCH 2/3] Fix converter CLI --- spacy/cli/convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index c7730ab9e..e95ffd08b 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 +7,6 @@ from pathlib import Path from .converters import conllu2json, iob2json from ..util import prints - # Converters are matched by file extension. To add a converter, add a new entry # to this dict with the file extension mapped to the converter function imported # from /converters. @@ -25,7 +24,7 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(input_file, output_dir, n_sents, morphology): +def convert(_, input_file, output_dir, n_sents, morphology): """Convert files into JSON format for use with train command and other experiment management functions. """ @@ -39,4 +38,5 @@ def convert(input_file, output_dir, n_sents, morphology): if not file_ext in CONVERTERS: prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, *args) + CONVERTERS[file_ext](input_path, output_path, + n_sents=n_sents, morphology=morphology) From 2e587c641734c4110e0c0154ddc8e04c68a5a83f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:32:55 -0500 Subject: [PATCH 3/3] Export iob_to_biluo utility --- spacy/gold.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 579010e6d..f9500dbb6 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -305,7 +305,7 @@ def read_json_file(loc, docs_filter=None, limit=None): yield [paragraph.get('raw', None), sents] -def _iob_to_biluo(tags): +def iob_to_biluo(tags): out = [] curr_label = None tags = list(tags)