spaCy/spacy/tests/hu/tokenizer/test_tokenizer.py

83 lines
3.1 KiB
Python
Raw Normal View History

2016-12-10 22:29:41 +00:00
import os
import re
import pytest
from spacy.hu import Hungarian
# Absolute directory containing this test module; the tokenizer test-data
# files are looked up relative to it. ``abspath`` guards against ``__file__``
# being a bare relative filename, in which case ``dirname`` alone would
# return an empty string.
_MODULE_PATH = os.path.dirname(os.path.abspath(__file__))
class TokenizerTestCase(object):
    """One tokenizer test case: an input text and the tokens expected for it.

    Test cases are read from plain-text files in which lines starting with
    ``INPUT_PREFIX`` carry the raw input, lines starting with
    ``OUTPUT_PREFIX`` carry the expected tokens wrapped in ``<w>...</w>`` or
    ``<c>...</c>`` tags, and any other line (comment, blank line) separates
    test cases.
    """

    INPUT_PREFIX = "IN :"
    OUTPUT_PREFIX = "OUT:"
    # One tagged token; the back-reference forces the closing tag to match
    # the opening one. Group 2 is the token text.
    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

    def __init__(self, input_str, expected_words):
        """Store the input text and the list of expected token strings."""
        self.input = input_str
        self.expected_tokens = expected_words

    def __repr__(self):
        return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)

    def to_tuple(self):
        """Return the test case as an ``(input, expected_tokens)`` tuple."""
        return (self.input, self.expected_tokens)

    @classmethod
    def _parse_output_line(cls, line):
        """Yield the text of every tagged token found in ``line``."""
        for match in cls.WORD_PATTERN.finditer(line):
            yield match.group(2)

    @classmethod
    def read_from_file(cls, path):
        """Yield the test cases stored in the file at ``path``.

        Consecutive input lines are stripped and joined with a newline;
        tokens from consecutive output lines are concatenated. A test case
        ends when a separator line is met, when a new input line follows
        output lines, or when the file ends.

        The file is decoded as UTF-8.
        """
        # Function-scope import: ``io.open`` accepts ``encoding`` on both
        # Python 2 and Python 3, unlike the Python 2 builtin ``open``.
        import io

        input_lines = []
        output_words = []
        last_type = None
        with io.open(path, encoding="utf-8") as f:
            for line in f:
                if line.startswith(cls.INPUT_PREFIX):
                    # Input directly after output starts the next test case
                    # even without a separator line in between.
                    if last_type == cls.OUTPUT_PREFIX and input_lines:
                        yield cls("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
                    last_type = cls.INPUT_PREFIX
                elif line.startswith(cls.OUTPUT_PREFIX):
                    output_words.extend(cls._parse_output_line(line.strip()))
                    last_type = cls.OUTPUT_PREFIX
                else:
                    # Comments separate test cases
                    if input_lines:
                        yield cls("\n".join(input_lines), output_words)
                        input_lines = []
                        output_words = []
                    last_type = None
        # Flush the final test case when the file does not end with a
        # separator line; otherwise it would be silently dropped.
        if input_lines:
            yield cls("\n".join(input_lines), output_words)
# Test cases loaded at import time from the data files that sit next to this
# module. ``os.path.join`` is used instead of concatenating with "/" so the
# paths stay correct on every platform and when _MODULE_PATH is empty.
_DOTS_CASES = list(TokenizerTestCase.read_from_file(os.path.join(_MODULE_PATH, "test_default_token_dots.txt")))
_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(os.path.join(_MODULE_PATH, "test_default_token_hyphen.txt")))
_QUOTE_CASES = list(TokenizerTestCase.read_from_file(os.path.join(_MODULE_PATH, "test_default_token_quote.txt")))
_NUMBER_CASES = list(TokenizerTestCase.read_from_file(os.path.join(_MODULE_PATH, "test_default_token_numbers.txt")))
2016-12-20 22:49:35 +00:00
# ``os.path.join`` is used instead of concatenating with "/" so the paths stay
# correct on every platform and when _MODULE_PATH is empty.
_MISC_CASES = list(TokenizerTestCase.read_from_file(os.path.join(_MODULE_PATH, "test_default_token_misc.txt")))
_IT_CASES = list(TokenizerTestCase.read_from_file(os.path.join(_MODULE_PATH, "test_default_token_it.txt")))
# TODO: Until this gets fixed we cannot really test the urls: https://github.com/explosion/spaCy/issues/344
# _IT_CASES is loaded above but deliberately left out of the combined list.
ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES + _QUOTE_CASES + _NUMBER_CASES + _MISC_CASES  # + _IT_CASES
2016-12-10 22:29:41 +00:00
@pytest.fixture(scope="session")
def HU():
    """Provide one ``Hungarian`` instance shared by the whole test session."""
    language = Hungarian()
    return language
@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    """Provide the ``tokenizer`` attribute of the ``HU`` fixture, once per module."""
    tokenizer = HU.tokenizer
    return tokenizer
@pytest.mark.parametrize("test_case", ALL_TESTCASES)
def test_testcases(hu_tokenizer, test_case):
    """Tokenize the test case's input and compare it with the expected tokens.

    Whitespace tokens (``is_space``) are ignored in the comparison.
    """
    doc = hu_tokenizer(test_case.input)
    actual_tokens = [tok.orth_ for tok in doc if not tok.is_space]
    assert test_case.expected_tokens == actual_tokens