spaCy/spacy/munge/read_ontonotes.py

48 lines
1.5 KiB
Python

import re
# Compiled patterns for the SGML-style fields read by sgml_extract().
# Every pattern except tag_re has exactly one capture group holding the
# field's content, which is what _get_one() relies on.

# Single-valued header fields; the content may not contain '>'.
docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
# The SOURCE attribute must be present but its value is not captured.
doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')
datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
# DOTALL lets '.' match newlines, so these two can span several lines.
# The '.+' is greedy: with repeated tags the capture runs from the first
# opening tag to the last closing tag.
headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
# No DOTALL here, so the content must sit on a single line.
poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
# Matches a whole '<x>content</y>' span (tags plus the text between them);
# used by _get_text() to delete such spans. It has no capture group.
tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')
def sgml_extract(text_data):
    """Extract the metadata fields and body text of one SGML document.

    Returns a single dict of this form:

        {
            'docid': string,
            'doctype': string,
            'datetime': string,
            'headline': string,
            'poster': string,
            'postdate': string,
            'text': string
        }

    'docid', 'doctype', 'datetime' and 'headline' are looked up with
    required=True, so a document lacking any of them makes _get_one()
    raise instead of returning ''.

    'poster', 'postdate' and 'text' are all derived from the content of the
    <POST> element. They are optional: when there is no <POST> element, or
    the sub-field is absent, the value is the empty string.
    """
    # Required header fields are resolved first so that a missing one fails
    # before any optional processing happens.
    docid = _get_one(docid_re, text_data, required=True)
    doctype = _get_one(doctype_re, text_data, required=True)
    datetime = _get_one(datetime_re, text_data, required=True)
    headline = _get_one(headline_re, text_data, required=True)

    # Search for the <POST> body once and reuse it for the three fields
    # that are read out of it ('' when the document has no <POST>).
    post = _get_one(post_re, text_data)

    return {
        'docid': docid,
        'doctype': doctype,
        'datetime': datetime,
        'headline': headline,
        'poster': _get_one(poster_re, post),
        'postdate': _get_one(postdate_re, post),
        'text': _get_text(post).strip(),
    }
def _get_one(regex, text, required=False):
    """Return the stripped content of the single group of `regex` in `text`.

    Args:
        regex: A compiled pattern with exactly one capture group.
        text: The string to search.
        required: When False (the default) a failed search yields ''.
            When True a failed search raises ValueError.

    Returns:
        The first match's captured group with surrounding whitespace
        removed, or '' if nothing matched and the field is not required.

    Raises:
        ValueError: If `required` is true and `regex` does not match.
    """
    match = regex.search(text)
    if match is None:
        if required:
            # Fail with a message that names the missing field instead of
            # letting a None match blow up on attribute access below.
            raise ValueError(
                'required pattern %r not found in text' % regex.pattern)
        return ''
    groups = match.groups()
    # Internal invariant on the patterns passed in, not input validation.
    assert len(groups) == 1, match
    return groups[0].strip()
def _get_text(data):
    """Strip markup from `data` and return what is left.

    Every span matched by tag_re (an opening tag, the text after it, and a
    closing tag) is deleted first, content included. Any literal '<P>' and
    '</P>' markers still present are then removed.
    """
    cleaned = tag_re.sub('', data)
    for marker in ('<P>', '</P>'):
        cleaned = cleaned.replace(marker, '')
    return cleaned