From 744f06abf541a5df8a1dd6ea0eaeb22c9282ef74 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sun, 24 May 2015 21:49:58 +0200
Subject: [PATCH] * Add script to read OntoNotes source documents

---
 spacy/munge/read_ontonotes.py | 47 +++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 spacy/munge/read_ontonotes.py
diff --git a/spacy/munge/read_ontonotes.py b/spacy/munge/read_ontonotes.py
new file mode 100644
index 000000000..38c3c780e
--- /dev/null
+++ b/spacy/munge/read_ontonotes.py
@@ -0,0 +1,47 @@
+import re
+
+
+docid_re = re.compile(r'<DOCID>([^>]+)</DOCID>')  # group 1: '>'-free content
+doctype_re = re.compile(r'<DOCTYPE SOURCE="[^"]+">([^>]+)</DOCTYPE>')  # SOURCE value not captured
+datetime_re = re.compile(r'<DATETIME>([^>]+)</DATETIME>')  # group 1: '>'-free content
+headline_re = re.compile(r'<HEADLINE>(.+)</HEADLINE>', re.DOTALL)  # DOTALL: may span lines
+post_re = re.compile(r'<POST>(.+)</POST>', re.DOTALL)  # greedy: runs to the last </POST>
+poster_re = re.compile(r'<POSTER>(.+)</POSTER>')  # no DOTALL: single line only
+postdate_re = re.compile(r'<POSTDATE>(.+)</POSTDATE>')  # no DOTALL: single line only
+tag_re = re.compile(r'<[^>]+>[^>]+</[^>]+>')  # open tag, '>'-free text, close tag
+
+
+def sgml_extract(text_data):
+    """Extract metadata and body text from one OntoNotes web document.
+
+    text_data is the document's raw SGML as a single string.  Returns a
+    dict of stripped strings keyed by 'docid', 'doctype', 'datetime',
+    'headline', 'poster', 'postdate' and 'text'.
+
+    The first four are looked up with required=True.  'poster' and
+    'postdate' are searched for inside the <POST> element and are '' when
+    not found; 'text' is the <POST> content after _get_text's cleanup
+    ('' when there is no <POST>).
+    """
+    post = _get_one(post_re, text_data)  # optional; searched once, reused below
+    return {
+        'docid': _get_one(docid_re, text_data, required=True),
+        'doctype': _get_one(doctype_re, text_data, required=True),
+        'datetime': _get_one(datetime_re, text_data, required=True),
+        'headline': _get_one(headline_re, text_data, required=True),
+        'poster': _get_one(poster_re, post),
+        'postdate': _get_one(postdate_re, post),
+        'text': _get_text(post).strip()
+    }
+
+
+def _get_one(regex, text, required=False):
+    """Return first match's group 1, stripped; '' if absent (ValueError if required)."""
+    match = regex.search(text)
+    if match is None and required:  # previously an opaque AttributeError on None
+        raise ValueError('required pattern not found: %s' % regex.pattern)
+    return match.group(1).strip() if match is not None else ''
+
+
+def _get_text(data):  # Delete tag_re matches, then literal <P> and </P> markers.
+    return tag_re.sub('', data).replace('<P>', '').replace('</P>', '')