|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
|
|
# Irish tokenizer-exception cases: (input text, expected non-space token texts).
# NOTE(review): the first case expects 'O' without the fada for input 'Ó' —
# presumably the tokenizer exception normalizes it; confirm that is intended.
GA_TOKEN_EXCEPTION_TESTS = [
    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).',
     ['Niall', 'O', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977',
      '(', 'lch.', '600', ')', '.']),
    ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise',
     ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa',
      'agus', 'mise'])
]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS)
def test_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
    """Tokenize each exception-case text and compare the non-space token
    texts against the expected list."""
    doc = ga_tokenizer(text)
    observed = [tok.text for tok in doc if not tok.is_space]
    assert observed == expected_tokens
|
|
|
|
|