mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
dc07d72d80
|
@ -7,7 +7,6 @@ from pathlib import Path
|
||||||
from .converters import conllu2json, iob2json
|
from .converters import conllu2json, iob2json
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
|
|
||||||
|
|
||||||
# Converters are matched by file extension. To add a converter, add a new entry
|
# Converters are matched by file extension. To add a converter, add a new entry
|
||||||
# to this dict with the file extension mapped to the converter function imported
|
# to this dict with the file extension mapped to the converter function imported
|
||||||
# from /converters.
|
# from /converters.
|
||||||
|
@ -39,4 +38,5 @@ def convert(_, input_file, output_dir, n_sents, morphology):
|
||||||
if not file_ext in CONVERTERS:
|
if not file_ext in CONVERTERS:
|
||||||
prints("Can't find converter for %s" % input_path.parts[-1],
|
prints("Can't find converter for %s" % input_path.parts[-1],
|
||||||
title="Unknown format", exits=1)
|
title="Unknown format", exits=1)
|
||||||
CONVERTERS[file_ext](input_path, output_path, n_sents, morphology)
|
CONVERTERS[file_ext](input_path, output_path,
|
||||||
|
n_sents=n_sents, morphology=morphology)
|
||||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...compat import json_dumps, path2str
|
from ...compat import json_dumps, path2str
|
||||||
from ...util import prints
|
from ...util import prints
|
||||||
|
from ...gold import iob_to_biluo
|
||||||
|
|
||||||
|
|
||||||
def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
||||||
|
@ -29,9 +30,10 @@ def read_iob(file_):
|
||||||
continue
|
continue
|
||||||
tokens = [t.rsplit('|', 2) for t in line.split()]
|
tokens = [t.rsplit('|', 2) for t in line.split()]
|
||||||
words, pos, iob = zip(*tokens)
|
words, pos, iob = zip(*tokens)
|
||||||
|
biluo = iob_to_biluo(iob)
|
||||||
sentences.append([
|
sentences.append([
|
||||||
{'orth': w, 'tag': p, 'ner': ent}
|
{'orth': w, 'tag': p, 'ner': ent}
|
||||||
for (w, p, ent) in zip(words, pos, iob)
|
for (w, p, ent) in zip(words, pos, biluo)
|
||||||
])
|
])
|
||||||
sentences = [{'tokens': sent} for sent in sentences]
|
sentences = [{'tokens': sent} for sent in sentences]
|
||||||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
||||||
|
|
|
@ -306,7 +306,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
yield [paragraph.get('raw', None), sents]
|
yield [paragraph.get('raw', None), sents]
|
||||||
|
|
||||||
|
|
||||||
def _iob_to_biluo(tags):
|
def iob_to_biluo(tags):
|
||||||
out = []
|
out = []
|
||||||
curr_label = None
|
curr_label = None
|
||||||
tags = list(tags)
|
tags = list(tags)
|
||||||
|
|
Loading…
Reference in New Issue