Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2017-05-27 08:20:40 -05:00 · 2017-05-27 08:20:40 -05:00 · dc07d72d80
parent de13fe0305 d06f235fc9
commit dc07d72d80
3 changed files with 6 additions and 4 deletions
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -7,7 +7,6 @@ from pathlib import Path
 from .converters import conllu2json, iob2json
 from ..util import prints

-
 # Converters are matched by file extension. To add a converter, add a new entry
 # to this dict with the file extension mapped to the converter function imported
 # from /converters.
@@ -39,4 +38,5 @@ def convert(_, input_file, output_dir, n_sents, morphology):
    if not file_ext in CONVERTERS:
        prints("Can't find converter for %s" % input_path.parts[-1],
               title="Unknown format", exits=1)
-    CONVERTERS[file_ext](input_path, output_path, n_sents, morphology)
+    CONVERTERS[file_ext](input_path, output_path,
+            n_sents=n_sents, morphology=morphology)
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from ...compat import json_dumps, path2str
 from ...util import prints
+from ...gold import iob_to_biluo


 def iob2json(input_path, output_path, n_sents=10, *a, **k):
@@ -29,9 +30,10 @@ def read_iob(file_):
            continue
        tokens = [t.rsplit('|', 2) for t in line.split()]
        words, pos, iob = zip(*tokens)
+        biluo = iob_to_biluo(iob)
        sentences.append([
            {'orth': w, 'tag': p, 'ner': ent}
-            for (w, p, ent) in zip(words, pos, iob)
+            for (w, p, ent) in zip(words, pos, biluo)
        ])
    sentences = [{'tokens': sent} for sent in sentences]
    paragraphs = [{'sentences': [sent]} for sent in sentences]
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -306,7 +306,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
                    yield [paragraph.get('raw', None), sents]


-def _iob_to_biluo(tags):
+def iob_to_biluo(tags):
    out = []
    curr_label = None
    tags = list(tags)