diff --git a/spacy/munge/read_ontonotes.py b/spacy/munge/read_ontonotes.py
new file mode 100644
index 000000000..38c3c780e
--- /dev/null
+++ b/spacy/munge/read_ontonotes.py
@@ -0,0 +1,47 @@
+import re
+
+
+# Patterns for the SGML-style markup of OntoNotes web documents.  Every field
+# pattern captures the text between an opening tag and its closing tag as
+# group 1, which is the single group `_get_one` expects.
+# NOTE(review): tag names assume the usual <DOCID>/<DOCTYPE>/<DATETIME>/...
+# layout of the corpus files -- confirm against the data.
+docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')
+# The opening DOCTYPE tag may carry attributes (e.g. SOURCE="..."), so accept
+# anything up to its closing '>'.
+doctype_re = re.compile(r'<DOCTYPE[^>]*>([^>]+)</DOCTYPE>')
+datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')
+# Headlines and posts can span several lines, hence DOTALL.
+headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)
+post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)
+poster_re = re.compile(r'<POSTER>(.+)</POSTER>')
+postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')
+# A complete element: opening tag, content, closing tag.  Used to delete
+# embedded metadata elements from the post body.
+tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')
+
+
+def sgml_extract(text_data):
+    """Extract metadata and post text from one OntoNotes web document.
+
+    Args:
+        text_data: The document's SGML-style markup, as a single string.
+
+    Returns:
+        A dict of whitespace-stripped strings:
+        {
+            docid: string,
+            doctype: string,
+            datetime: string,
+            headline: string,
+            poster: string,
+            postdate: string,
+            text: string
+        }
+        docid, doctype, datetime and headline are looked up with
+        required=True, so `_get_one` fails if any of them is absent.
+        poster, postdate and text come from the post section and are
+        empty strings when that section (or the field) is not found.
+    """
+    # The post section is optional; search for it once and reuse the result
+    # for the three fields that live inside it.
+    post = _get_one(post_re, text_data)
+    return {
+        'docid': _get_one(docid_re, text_data, required=True),
+        'doctype': _get_one(doctype_re, text_data, required=True),
+        'datetime': _get_one(datetime_re, text_data, required=True),
+        'headline': _get_one(headline_re, text_data, required=True),
+        'poster': _get_one(poster_re, post),
+        'postdate': _get_one(postdate_re, post),
+        'text': _get_text(post).strip(),
+    }
+
+
+def _get_one(regex, text, required=False):
+    """Return the stripped text captured by the single group of `regex`.
+
+    Args:
+        regex: A compiled pattern containing exactly one capturing group.
+        text: The string to search.
+        required: If true, a failed search is an error; otherwise a failed
+            search yields an empty string.
+
+    Returns:
+        The group's text with surrounding whitespace removed, or '' when the
+        pattern is not found and `required` is false.
+
+    Raises:
+        ValueError: If `required` is true and the pattern is not found.
+        AssertionError: If the pattern does not have exactly one group.
+    """
+    match = regex.search(text)
+    if match is None:
+        if required:
+            # Fail with a message naming the pattern, rather than falling
+            # through to an AttributeError on None.
+            raise ValueError(
+                'Required pattern %r not found in document' % regex.pattern)
+        return ''
+    groups = match.groups()
+    assert len(groups) == 1, match
+    return groups[0].strip()
+
+
+def _get_text(data):
+    """Strip markup from `data`, leaving only the running text.
+
+    Args:
+        data: The contents of a post section, as a string.
+
+    Returns:
+        `data` with every span matched by `tag_re` deleted and the
+        paragraph markers removed.  Surrounding whitespace is left intact.
+    """
+    without_elements = tag_re.sub('', data)
+    # NOTE(review): assumes paragraph breaks are marked with <P> / </P> --
+    # confirm against the corpus files.
+    return without_elements.replace('<P>', '').replace('</P>', '')