mirror of https://github.com/explosion/spaCy.git
32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
|
import pytest
|
||
|
import os
|
||
|
from os import path
|
||
|
|
||
|
from spacy.munge.read_ontonotes import sgml_extract
|
||
|
|
||
|
|
||
|
# Load the SGML sample stored alongside this test module. Reading inside a
# ``with`` block closes the file handle as soon as the text is in memory,
# instead of leaving it open until garbage collection.
with open(path.join(path.dirname(__file__), 'web_sample1.sgm')) as _sample_file:
    text_data = _sample_file.read()
|
||
|
|
||
|
|
||
|
def test_example_extract():
    """Check the fields that ``sgml_extract`` returns for the bundled sample.

    The metadata entries are compared against the values expected for the
    sample document, then the body text is checked for its expected
    beginning and ending and for the absence of any ``<`` character.
    """
    parsed = sgml_extract(text_data)
    # The sample carries the same timestamp for 'datetime' and 'postdate'.
    stamp = '2006-09-24T10:41:00'

    assert parsed['docid'] == 'blogspot.com_alaindewitt_20060924104100_ENG_20060924_104100'
    assert parsed['doctype'] == 'BLOG TEXT'
    assert parsed['datetime'] == stamp
    # Only the headline is compared after stripping surrounding whitespace.
    headline = parsed['headline'].strip()
    assert headline == 'Devastating Critique of the Arab World by One of Its Own'
    assert parsed['poster'] == 'Alain DeWitt'
    assert parsed['postdate'] == stamp

    # On failure, show a short slice of the body instead of the whole text.
    body = parsed['text']
    assert body.startswith('Thanks again to my fri'), body[:10]
    assert body.endswith(' tide will turn."'), body[-10:]
    assert '<' not in body, body[:10]
|
||
|
|
||
|
|
||
|
def test_directory():
    """Smoke-test ``sgml_extract`` on every file in a local corpus directory.

    The directory is a hard-coded absolute path, so it only exists on
    machines where that data has been installed. When it is missing the
    test is skipped rather than erroring out in ``os.listdir``.

    There are no assertions on the extracted result: the test passes as
    long as ``sgml_extract`` raises nothing for each file.
    """
    context_dir = '/usr/local/data/OntoNotes5/data/english/metadata/context/wb/sel'
    if not path.isdir(context_dir):
        pytest.skip('OntoNotes5 context directory not found: ' + context_dir)

    for fn in os.listdir(context_dir):
        with open(path.join(context_dir, fn)) as file_:
            text = file_.read()
        sgml_extract(text)
|
||
|
|