mirror of https://github.com/explosion/spaCy.git
Merge branch 'feature/docs_to_json' into develop
commit ea2edd1e2c
@@ -563,6 +563,32 @@ cdef class GoldParse:
                 self.c.sent_start[i] = 0
 
 
+def docs_to_json(id, docs):
+    '''Convert a list of Doc objects into the JSON-serializable format used by
+    the spacy train command. Each Doc in the list will be interpreted as a
+    paragraph.
+    '''
+    if isinstance(docs, Doc):
+        docs = [docs]
+    json_doc = {'id': id, 'paragraphs': []}
+    for i, doc in enumerate(docs):
+        json_para = {'raw': doc.text, 'sentences': []}
+        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
+        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
+        for j, sent in enumerate(doc.sents):
+            json_sent = {'tokens': [], 'brackets': []}
+            for token in sent:
+                json_token = {"id": token.i, "orth": token.text}
+                json_token['tag'] = token.tag_ if doc.is_tagged else None
+                json_token['head'] = (token.head.i-token.i) if doc.is_parsed else None
+                json_token['dep'] = token.dep_ if doc.is_parsed else None
+                json_token['ner'] = biluo_tags[token.i]
+                json_sent['tokens'].append(json_token)
+            json_para['sentences'].append(json_sent)
+        json_doc['paragraphs'].append(json_para)
+    return json_doc
+
+
 def biluo_tags_from_offsets(doc, entities, missing='O'):
     """Encode labelled spans into per-token tags, using the
     Begin/In/Last/Unit/Out scheme (BILUO).
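For context, a minimal usage sketch of the new helper (not part of the diff). It assumes the separately installed en_core_web_sm model, so the docs carry the tagger, parser, and NER annotations that fill the 'tag', 'head', 'dep', and 'ner' fields:

    # Usage sketch; en_core_web_sm is an assumed, separately installed model.
    import json
    import spacy
    from spacy.gold import docs_to_json

    nlp = spacy.load('en_core_web_sm')
    docs = [nlp(u'Apple is looking at buying a U.K. startup.'),
            nlp(u'London is a big city.')]
    # Each Doc becomes one paragraph in the output.
    json_doc = docs_to_json(0, docs)
    print(json.dumps(json_doc, indent=2))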
@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import docs_to_json
 from spacy.tokens import Doc
+from .util import get_doc
 
 
 def test_gold_biluo_U(en_vocab):
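The two helpers already imported above convert between character offsets and BILUO tags in both directions. A minimal roundtrip sketch, assuming only a blank English tokenizer (the sentence and span are invented for illustration):

    # Roundtrip sketch: character offsets -> BILUO tags -> offsets.
    import spacy
    from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags

    nlp = spacy.blank('en')
    doc = nlp(u'I flew to San Francisco Valley')
    offsets = [(10, 30, 'LOC')]  # characters 10-30: 'San Francisco Valley'
    tags = biluo_tags_from_offsets(doc, offsets)
    assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']
    assert offsets_from_biluo_tags(doc, tags) == offsets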
@@ -50,3 +52,31 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
     assert biluo_tags_converted == biluo_tags
     offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
     assert offsets_converted == offsets
+
+
+def test_docs_to_json(en_vocab):
+    '''Test we can convert a list of Doc objects into the JSON-serializable
+    format we use for training.
+    '''
+    docs = [
+        get_doc(
+            en_vocab,
+            words=['a', 'b'],
+            pos=['VBP', 'NN'],
+            heads=[0, -1],
+            deps=['ROOT', 'dobj'],
+            ents=[]),
+        get_doc(
+            en_vocab,
+            words=['c', 'd', 'e'],
+            pos=['VBP', 'NN', 'NN'],
+            heads=[0, -1, -2],
+            deps=['ROOT', 'dobj', 'dobj'],
+            ents=[(1, 2, 'ORG')]),
+    ]
+    json_doc = docs_to_json(0, docs)
+    assert json_doc['id'] == 0
+    assert len(json_doc['paragraphs']) == 2
+    assert len(json_doc['paragraphs'][0]['sentences']) == 1
+    assert len(json_doc['paragraphs'][1]['sentences']) == 1
+    assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2
+    assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3
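For reference, the shape of the structure these assertions walk, traced from the docs_to_json body above (the values here are placeholders, not captured output):

    # Shape sketch of docs_to_json() output; values are illustrative only.
    {
        'id': 0,
        'paragraphs': [{
            'raw': '...',            # doc.text
            'sentences': [{
                'tokens': [{
                    'id': 0,         # token.i
                    'orth': '...',   # token.text
                    'tag': '...',    # token.tag_, or None if untagged
                    'head': 0,       # token.head.i - token.i, or None
                    'dep': '...',    # token.dep_, or None if unparsed
                    'ner': 'O',      # BILUO tag derived from doc.ents
                }],
                'brackets': [],
            }],
        }],
    }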