"""Read OntoNotes source documents.

Helpers for pulling the metadata fields and the post text out of the
SGML-style markup used by the OntoNotes web documents.
"""
# Provenance: spacy/munge/read_ontonotes.py, added by commit
# 744f06abf541a5df8a1dd6ea0eaeb22c9282ef74 (Matthew Honnibal, 2015-05-24),
# "Add script to read OntoNotes source documents".
import re


# NOTE(review): the element names inside these patterns (and the <P> / </P>
# literals in _get_text) were missing from the copy under review because
# angle-bracket tags had been stripped from it.  They are reconstructed from
# the variable names and the keys returned by sgml_extract -- confirm against
# the corpus files.
docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
# The opening tag is matched loosely so that any attributes on it are
# tolerated (presumably something like SOURCE="..."; TODO confirm).
doctype_re = re.compile(r'<DOCTYPE[^>]*>([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
# DOTALL: headline and post bodies span several lines.
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
# A whole "<tag>text</tag>" element whose text contains no ">".
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')


def sgml_extract(text_data):
    """Extract the fields of one OntoNotes web document.

    Args:
        text_data: The raw SGML text of a single document.

    Returns:
        A dict of stripped strings:

        {
            docid: string,
            doctype: string,
            datetime: string,
            headline: string,
            poster: string,
            postdate: string,
            text: string
        }

        ``poster``, ``postdate`` and ``text`` are '' when the document has
        no post block (or the post lacks the corresponding element).

    Raises:
        ValueError: If docid, doctype, datetime or headline is missing.
    """
    # Pull the post out once; poster, postdate and text are all read from it.
    post = _get_one(post_re, text_data)
    return {
        'docid': _get_one(docid_re, text_data, required=True),
        'doctype': _get_one(doctype_re, text_data, required=True),
        'datetime': _get_one(datetime_re, text_data, required=True),
        'headline': _get_one(headline_re, text_data, required=True),
        'poster': _get_one(poster_re, post),
        'postdate': _get_one(postdate_re, post),
        'text': _get_text(post).strip()
    }


def _get_one(regex, text, required=False):
    """Return the stripped capture group of the first match of ``regex``.

    Args:
        regex: A compiled pattern with exactly one capturing group.
        text: The string to search.
        required: When true, a missing match is an error instead of ''.

    Returns:
        The captured text with surrounding whitespace removed, or '' when
        there is no match and ``required`` is false.

    Raises:
        ValueError: If ``regex`` does not have exactly one group, or if
            ``required`` is true and ``regex`` does not match ``text``.
    """
    if regex.groups != 1:
        raise ValueError(
            'Expected exactly one capture group in %r' % regex.pattern)
    match = regex.search(text)
    if match is None:
        if required:
            # Explicit error rather than an AttributeError on None (and an
            # assert that disappears under -O).
            raise ValueError(
                'Required pattern %r not found in document' % regex.pattern)
        return ''
    return match.group(1).strip()


def _get_text(data):
    """Return ``data`` with markup removed.

    Complete "<tag>text</tag>" elements matched by ``tag_re`` are dropped
    together with their text, then any remaining paragraph markers are
    deleted.
    """
    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')