From a9c70c94472e623e804ca4b805134185cdc7f8fc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 24 May 2015 21:52:12 +0200
Subject: [PATCH] * Add tests for ontonotes sgml extraction

---
 tests/test_onto_sgml_extract.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 tests/test_onto_sgml_extract.py

diff --git a/tests/test_onto_sgml_extract.py b/tests/test_onto_sgml_extract.py
new file mode 100644
index 000000000..52870d4ea
--- /dev/null
+++ b/tests/test_onto_sgml_extract.py
@@ -0,0 +1,44 @@
+"""Tests for ontonotes sgml extraction."""
+import pytest
+import os
+from os import path
+
+from spacy.munge.read_ontonotes import sgml_extract
+
+
+# Location of the OntoNotes 5 context files; only present where the corpus
+# has been installed, so tests that need it are skipped elsewhere.
+CONTEXT_DIR = '/usr/local/data/OntoNotes5/data/english/metadata/context/wb/sel'
+
+
+# Use a context manager so the sample file handle is closed after reading.
+with open(path.join(path.dirname(__file__), 'web_sample1.sgm')) as file_:
+    text_data = file_.read()
+
+
+def test_example_extract():
+    """Check each field extracted from the bundled sample document."""
+    article = sgml_extract(text_data)
+    assert article['docid'] == 'blogspot.com_alaindewitt_20060924104100_ENG_20060924_104100'
+    assert article['doctype'] == 'BLOG TEXT'
+    assert article['datetime'] == '2006-09-24T10:41:00'
+    assert article['headline'].strip() == 'Devastating Critique of the Arab World by One of Its Own'
+    assert article['poster'] == 'Alain DeWitt'
+    assert article['postdate'] == '2006-09-24T10:41:00'
+    assert article['text'].startswith('Thanks again to my fri'), article['text'][:10]
+    assert article['text'].endswith(' tide will turn."'), article['text'][-10:]
+    assert '<' not in article['text'], article['text'][:10]
+
+
+@pytest.mark.skipif(not path.isdir(CONTEXT_DIR),
+                    reason='OntoNotes 5 corpus is not installed')
+def test_directory():
+    """Run sgml_extract over every file in CONTEXT_DIR.
+
+    Nothing is asserted about the results; the test only checks that
+    extraction does not raise on the corpus files.
+    """
+    for fn in os.listdir(CONTEXT_DIR):
+        with open(path.join(CONTEXT_DIR, fn)) as file_:
+            text = file_.read()
+        sgml_extract(text)