Additional abbreviation tests.

This commit is contained in:
Gyorgy Orosz 2016-12-08 12:17:44 +01:00
parent 90d22db023
commit 0289b8ceaa
1 changed file with 12 additions and 6 deletions

View File

@@ -13,9 +13,15 @@ def hu_tokenizer(HU):
     return HU.tokenizer
 
 
-def test_abbreviations(hu_tokenizer):
-    tokens = hu_tokenizer("A vs. egy")
-    assert len(tokens) == 3
-
-    tokens = hu_tokenizer("A dr. egy")
-    assert len(tokens) == 3
+@pytest.mark.parametrize(("input_str", "expected_length"), [
+    ("A vs. egy", 3),
+    ("A dr. egy", 3),
+    ("A .hu egy tld.", 5),
+    ("A .hu.", 3),
+    ("Az egy.ketto pelda.", 4),
+    ("A pl. rovidites.", 4),
+    ("A S.M.A.R.T. szo.", 4)
+])
+def test_abbreviations(hu_tokenizer, input_str, expected_length):
+    tokens = hu_tokenizer(input_str)
+    assert len(tokens) == expected_length