spaCy/spacy/tests/lang/pl/test_tokenizer.py

61 lines
1.6 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import unicode_literals
import pytest
# (input text, expected token texts) pairs covering periods, commas and
# currency amounts. Whitespace tokens are filtered out by the test before
# comparison, so the expected lists contain only non-space token texts.
DOT_TESTS = [
    # Abbreviations are expected to keep their trailing period attached.
    ('tel.', ['tel.']),
    ('np.', ['np.']),
    ('godz. 21:37', ['godz.', '21:37']),
    ('inż.', ['inż.']),
    ('gosp.-polit.', ['gosp.-polit.']),
    # Inputs without any period stay a single token.
    ('ppoż', ['ppoż']),
    ('płn', ['płn']),
    ('ul.', ['ul.']),
    ('jw.', ['jw.']),
    ('itd.', ['itd.']),
    ('cdn.', ['cdn.']),
    ('itp.', ['itp.']),
    # Amounts: the currency word is its own token. The expected value used
    # to be an empty string here, which no token text can ever equal.
    ('10,- zł', ['10,-', 'zł']),
    ('0 zł 99 gr', ['0', 'zł', '99', 'gr']),
    ('0,99 rub.', ['0,99', 'rub.']),
    ('dol.', ['dol.']),
    # Abbreviations with internal periods are expected to stay whole.
    ('1000 m n.p.m.', ['1000', 'm', 'n.p.m.']),
    ('m.in.', ['m.in.']),
    ('p.n.e.', ['p.n.e.']),
    ('Sz.P.', ['Sz.P.']),
    ('p.o.', ['p.o.']),
    ('k.o.', ['k.o.']),
    ('m.st.', ['m.st.']),
    # The one case in this list where the period is split off.
    ('dra.', ['dra', '.']),
    ('pp.', ['pp.']),
    ('oo.', ['oo.']),
]
# (input text, expected token texts) pairs covering hyphens. In every case
# the hyphen is expected to stay inside the token; only whitespace separates
# tokens.
HYPHEN_TESTS = [
    # Hyphen-joined alphanumeric names and designators.
    ('5-fluoropentylo-3-pirydynyloindol',
     ['5-fluoropentylo-3-pirydynyloindol']),
    ('NESS-040C5', ['NESS-040C5']),
    ('JTE-7-31', ['JTE-7-31']),
    ('BAY-59-3074', ['BAY-59-3074']),
    ('BAY-38-7271', ['BAY-38-7271']),
    ('STS-135', ['STS-135']),
    ('5F-PB-22', ['5F-PB-22']),
    # A trailing hyphen stays attached to the word before it.
    ('cztero-', ['cztero-']),
    ('jedno-', ['jedno-']),
    ('dwu-', ['dwu-']),
    ('trzy-', ['trzy-']),
    # A single letter or digits joined to what follows by hyphens.
    ('b-adoratorzy', ['b-adoratorzy']),
    ('2-3-4 drzewa', ['2-3-4', 'drzewa']),
    ('b-drzewa', ['b-drzewa']),
]
# Full parametrization table: the period cases followed by the hyphen cases.
TESTCASES = DOT_TESTS + HYPHEN_TESTS


@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
def test_tokenizer_handles_testcases(pl_tokenizer, text, expected_tokens):
    """Check that tokenizing ``text`` yields exactly ``expected_tokens``.

    ``pl_tokenizer`` is not in the parametrize list, so pytest resolves it
    as a fixture (presumably the Polish tokenizer from the suite's conftest;
    it is not defined in this file). Tokens flagged ``is_space`` are dropped
    before comparing, so the expected lists never contain whitespace tokens.
    """
    doc = pl_tokenizer(text)
    observed = [tok.text for tok in doc if not tok.is_space]
    assert expected_tokens == observed