"""Convert OntoNotes into a json format.

doc: {
    id: string,
    paragraphs: [{
        raw: string,
        segmented: string,
        sents: [int],
        tokens: [{
            start: int,
            tag: string,
            head: int,
            dep: string}],
        brackets: [{
            start: int,
            end: int,
            label: string,
            flabel: int}]}]}

Consumes output of spacy/munge/align_raw.py
"""
|
|
|
|
import plac
|
|
|
|
import json
|
|
|
|
from os import path
|
|
|
|
import re
|
|
|
|
|
|
|
|
from spacy.munge import read_ptb
|
|
|
|
from spacy.munge import read_conll
|
|
|
|
|
|
|
|
|
|
|
|
def _iter_raw_files(raw_loc):
    """Yield the entries of the JSON list stored at *raw_loc*.

    The file is expected to hold a JSON array (the output of
    spacy/munge/align_raw.py); each element is yielded unchanged,
    presumably a (filename, raw_paragraphs) pair — see the caller.
    """
    # Use a context manager so the file handle is closed promptly;
    # the original relied on the garbage collector to close it.
    with open(raw_loc) as file_:
        files = json.load(file_)
    for f in files:
        yield f
|
|
|
|
|
|
|
|
|
|
|
|
def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
    """Build the json 'doc' dict (see module docstring) for one document.

    section: two-digit WSJ section string, e.g. '02'. Unused in the body
        but kept for interface compatibility with callers.
    filename: document id; stored under doc['id'].
    raw_paras: list of paragraphs, each a list of raw sentence strings
        (any '<SEP>' markers are stripped from the raw text).
    ptb_loc: path to the PTB constituency (.parse) file.
    dep_loc: path to the matching CoNLL dependency (.dep) file.

    Returns the assembled doc dict. Token ids, heads, sentence ends and
    bracket offsets are all paragraph-relative.
    """
    # Read both annotation layers up front; close the files promptly
    # (the original leaked both handles via open(...).read()).
    with open(ptb_loc) as ptb_file:
        ptb_sents = read_ptb.split(ptb_file.read())
    with open(dep_loc) as dep_file:
        dep_sents = read_conll.split(dep_file.read())

    # The two files must align sentence-for-sentence.
    assert len(ptb_sents) == len(dep_sents)

    i = 0  # global sentence index across all paragraphs
    doc = {'id': filename, 'paragraphs': []}
    for raw_sents in raw_paras:
        para = {
            'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
            'sents': [],
            'tokens': [],
            'brackets': []}
        offset = 0  # token offset of the current sentence within the paragraph
        for raw_sent in raw_sents:
            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
            for token_id, token in enumerate(annot):
                # head == -1 marks the root: keep it as -1, don't offset it.
                head = (token['head'] + offset) if token['head'] != -1 else -1
                para['tokens'].append({
                    'id': offset + token_id,
                    'orth': token['word'],
                    'tag': token['tag'],
                    'head': head,
                    'dep': token['dep']})
            # NOTE: the original wrapped the loop body above in a bare
            # `try:/except: raise`, which is a no-op; it has been removed.
            for label, start, end in brackets:
                if start != end:  # skip trivial single-token brackets
                    para['brackets'].append({'label': label,
                                             'start': start + offset,
                                             'end': (end-1) + offset})
            i += 1
            offset += len(annot)
            # Record the cumulative token count as the sentence boundary.
            para['sents'].append(offset)
        doc['paragraphs'].append(para)
    return doc
|
|
|
|
|
|
|
|
|
2015-05-05 00:31:20 +00:00
|
|
|
def main(onto_dir, raw_dir, out_dir):
    """Convert all WSJ sections (00-24) of OntoNotes into json files.

    onto_dir: directory with one subdir per section holding the
        <file>.parse and <file>.parse.dep annotation files.
    raw_dir: directory with the wsjNN.json raw-text files produced by
        spacy/munge/align_raw.py.
    out_dir: destination directory; one NN.json file is written per section.
    """
    for i in range(25):
        section = '%02d' % i  # zero-padded two-digit section name
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        docs = []
        # NOTE: the original incremented an unused enumerate counter for
        # section '00'; that was dead code and has been removed.
        for filename, raw_paras in _iter_raw_files(raw_loc):
            # Document 55 of section 04 is skipped outright —
            # presumably a known-bad file; confirm against the corpus.
            if section == '04' and filename == '55':
                continue
            ptb_loc = path.join(onto_dir, section, '%s.parse' % filename)
            dep_loc = ptb_loc + '.dep'
            # Only convert documents that have both annotation layers.
            if path.exists(ptb_loc) and path.exists(dep_loc):
                doc = format_doc(section, filename, raw_paras, ptb_loc, dep_loc)
                docs.append(doc)
        with open(path.join(out_dir, '%s.json' % section), 'w') as file_:
            json.dump(docs, file_, indent=4)
|
2015-05-04 23:00:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # plac maps the command-line arguments onto main's parameters
    # (onto_dir, raw_dir, out_dir).
    plac.call(main)
|
|
|
|
|