# spaCy/bin/munge_ewtb.py

#!/usr/bin/env python
from __future__ import unicode_literals

from xml.etree import cElementTree as ElementTree
import json
import re

import plac
from pathlib import Path
from os import path


# Maps bracket escape tokens (-LRB-, -RRB-, ...) to the literal bracket
# character each one stands for.
# NOTE(review): nothing in the visible part of this file reads this table.
escaped_tokens = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-LSB-': '[',
    '-RSB-': ']',
    '-LCB-': '{',
    '-RCB-': '}',
}

def read_parses(parse_loc):
    """Read the dependency parses stored next to a tree file.

    Opens ``str(parse_loc) + '.dep'``, splits the contents into
    blank-line-separated sentence blocks and passes each block through
    ``_adjust_token_ids`` so that token ids and head references keep counting
    up across the whole document instead of restarting at every sentence.

    Args:
        parse_loc: Location of the parse file without the ``.dep`` suffix;
            anything ``str()`` turns into a path.

    Returns:
        A list with one renumbered, tab-separated parse string per sentence.
        The list is empty when the file contains no parses.
    """
    # Use a context manager so the handle is closed as soon as the file is
    # read, rather than whenever the garbage collector gets to it.
    with open(str(parse_loc) + '.dep') as dep_file:
        contents = dep_file.read()

    offset = 0
    doc = []
    for parse in contents.strip().split('\n\n'):
        # An empty file, or more than one blank line between sentences,
        # produces empty/partially blank chunks; drop them instead of letting
        # the column indexing in _adjust_token_ids fail on a blank line.
        parse = parse.strip()
        if not parse:
            continue
        parse = _adjust_token_ids(parse, offset)
        # The next sentence continues numbering after this one's tokens.
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc

def _adjust_token_ids(parse, offset):
    """Shift the token ids and head references of one parse by ``offset``.

    Every line of ``parse`` is split on whitespace. Column 0 (the token id)
    is always increased by ``offset``; column 5 (the head id) is increased
    too unless it is the string ``'0'``, which is kept as is. Columns are
    re-joined with tabs and lines with newlines.
    """
    def shift(line):
        columns = line.split()
        columns[0] = str(int(columns[0]) + offset)
        if columns[5] != '0':
            columns[5] = str(int(columns[5]) + offset)
        return '\t'.join(columns)

    return '\n'.join(shift(line) for line in parse.split('\n'))


def _fmt_doc(filename, paras):
    """Build the JSON-ready record for one document.

    ``paras`` is an iterable of argument sequences; each one is unpacked into
    ``_fmt_para``. The result maps ``'id'`` to ``filename`` and
    ``'paragraphs'`` to the list of formatted paragraphs.
    """
    paragraphs = []
    for para_args in paras:
        paragraphs.append(_fmt_para(*para_args))
    return {'id': filename, 'paragraphs': paragraphs}


def _fmt_para(raw, sents):
    """Build the JSON-ready record for one paragraph.

    The result maps ``'raw'`` to ``raw`` unchanged and ``'sentences'`` to the
    list obtained by formatting every item of ``sents`` with ``_fmt_sent``.
    """
    sentences = []
    for sent in sents:
        sentences.append(_fmt_sent(sent))
    return {'raw': raw, 'sentences': sentences}


def _fmt_sent(sent):
    """Build the JSON-ready record for one sentence.

    ``sent`` is stripped and split into lines; each line is split on
    whitespace and its columns are passed positionally to ``_fmt_token``.
    ``'brackets'`` is always an empty list.
    """
    tokens = []
    for line in sent.strip().split('\n'):
        columns = line.split()
        tokens.append(_fmt_token(*columns))
    return {'tokens': tokens, 'brackets': []}


def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
    """Build the JSON-ready record for one token from its ten columns.

    ``id_`` and ``head`` arrive as 1-based numbers in string form. The
    returned ``'id'`` is the 0-based token index, and ``'head'`` is the head's
    position relative to the token (head index minus token index), or 0 when
    the head column is 0. ``hyph``, ``ner`` and the three ``blank`` columns
    are accepted but not used.
    """
    head_index = int(head) - 1
    token_index = int(id_) - 1
    if head_index == -1:
        # Head column was 0: no head to point at.
        relative_head = 0
    else:
        relative_head = head_index - token_index
    return {
        'id': token_index,
        'orth': word,
        'tag': pos,
        'dep': dep,
        'head': relative_head,
    }


# Matches a markup tag: '<', then a word character, '?' or '/', then one or
# more characters other than '>', then '>'. main() uses it to strip tags from
# the source text of the 'answers' genre.
# NOTE(review): because of the `+`, a tag with a single character between the
# angle brackets (e.g. <p>) is not matched -- confirm this is intended.
tags_re = re.compile(r'<[\w\?/][^>]+>')
def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    """Convert a treebank directory into one JSON file per genre.

    For every genre directory under ``<ewtb_dir>/data``, each file in
    ``source/source_original`` is paired with its dependency parses (read by
    ``read_parses`` from ``penntree/<name>.xml.tree.dep``) and formatted with
    ``_fmt_doc``. The documents of a genre are written, as indented JSON, to
    ``<out_dir>/<genre>.json``.

    Args:
        out_dir: Directory that receives the JSON files; created (including
            missing parents) when it does not exist.
        ewtb_dir: Root of the treebank; must contain a ``data`` directory.
    """
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        # parents=True lets a nested output path work on the first run.
        out_dir.mkdir(parents=True)
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        # Stray files next to the genre directories have no penntree/source
        # sub-directories, so skip them instead of crashing on iterdir().
        if not genre_dir.is_dir():
            continue
        is_answers = 'answers' in str(genre_dir)
        parse_dir = genre_dir.joinpath('penntree')
        source_dir = genre_dir.joinpath('source').joinpath('source_original')
        docs = []
        # Sort so the output does not depend on directory listing order.
        for source_loc in sorted(source_dir.iterdir()):
            # Reduce the source file name to the bare document name.
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            # Close the source file as soon as it has been read.
            with source_loc.open() as source_file:
                source = source_file.read().strip()
            if is_answers:
                # Only the 'answers' genre has its markup tags removed.
                source = tags_re.sub('', source).strip()
            # Each document is emitted as a single paragraph holding the whole
            # raw text and all of its sentences.
            docs.append(_fmt_doc(filename, [[source, parses]]))

        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))


# Script entry point: when run directly (not imported), hand main() to plac,
# which builds the command-line interface from main's signature.
if __name__ == '__main__':
    plac.call(main)
* Add tb pre-process script 2015-06-05 23:59:44 +00:00			`#!/usr/bin/env python`
			`from __future__ import unicode_literals`

			`from xml.etree import cElementTree as ElementTree`
			`import json`
			`import re`

			`import plac`
			`from pathlib import Path`
			`from os import path`


			`escaped_tokens = {`
			`'-LRB-': '(',`
			`'-RRB-': ')',`
			`'-LSB-': '[',`
			`'-RSB-': ']',`
			`'-LCB-': '{',`
			`'-RCB-': '}',`
			`}`

			`def read_parses(parse_loc):`
			`offset = 0`
			`doc = []`
			`for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'):`
			`parse = _adjust_token_ids(parse, offset)`
			`offset += len(parse.split('\n'))`
			`doc.append(parse)`
			`return doc`

			`def _adjust_token_ids(parse, offset):`
			`output = []`
			`for line in parse.split('\n'):`
			`pieces = line.split()`
			`pieces[0] = str(int(pieces[0]) + offset)`
			`pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'`
			`output.append('\t'.join(pieces))`
			`return '\n'.join(output)`


			`def _fmt_doc(filename, paras):`
			`return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}`


			`def _fmt_para(raw, sents):`
* Upd munge_ewtb for the new json format 2015-06-06 00:10:33 +00:00			`return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}`


			`def _fmt_sent(sent):`
			`return {`
			`'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')],`
			`'brackets': []}`
* Add tb pre-process script 2015-06-05 23:59:44 +00:00

			`def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):`
* Upd munge_ewtb for the new json format 2015-06-06 00:10:33 +00:00			`head = int(head) - 1`
			`id_ = int(id_) - 1`
			`head = (head - id_) if head != -1 else 0`
			`return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}`
* Add tb pre-process script 2015-06-05 23:59:44 +00:00

			`tags_re = re.compile(r'<[\w\?/][^>]+>')`
			`def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):`
			`ewtb_dir = Path(ewtb_dir)`
			`out_dir = Path(out_dir)`
			`if not out_dir.exists():`
			`out_dir.mkdir()`
			`for genre_dir in ewtb_dir.joinpath('data').iterdir():`
			`#if 'answers' in str(genre_dir): continue`
			`parse_dir = genre_dir.joinpath('penntree')`
			`docs = []`
			`for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():`
			`filename = source_loc.parts[-1].replace('.sgm.sgm', '')`
			`filename = filename.replace('.xml', '')`
			`filename = filename.replace('.txt', '')`
			`parse_loc = parse_dir.joinpath(filename + '.xml.tree')`
			`parses = read_parses(parse_loc)`
			`source = source_loc.open().read().strip()`
			`if 'answers' in str(genre_dir):`
			`source = tags_re.sub('', source).strip()`
			`docs.append(_fmt_doc(filename, [[source, parses]]))`

			`out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')`
			`with open(str(out_loc), 'w') as out_file:`
			`out_file.write(json.dumps(docs, indent=4))`


			`if __name__ == '__main__':`
			`plac.call(main)`