import pytest

# fmt: off
TESTCASES = [
    # Punctuation tests
    ("আমি বাংলায় গান গাই!", ["আমি", "বাংলায়", "গান", "গাই", "!"]),
    ("আমি বাংলায় কথা কই।", ["আমি", "বাংলায়", "কথা", "কই", "।"]),
    ("বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?", ["বসুন্ধরা", "জনসম্মুখে", "দোষ", "স্বীকার", "করলো", "না", "?"]),
    ("টাকা থাকলে কি না হয়!", ["টাকা", "থাকলে", "কি", "না", "হয়", "!"]),
    ("সরকারি বিশ্ববিদ্যালয়-এর ছাত্র নই বলেই কি এমন আচরণ?", ["সরকারি", "বিশ্ববিদ্যালয়", "-", "এর", "ছাত্র", "নই", "বলেই", "কি", "এমন", "আচরণ", "?"]),
    ('তারা বলে, "ওরা খামারের মুরগি।"', ["তারা", "বলে", ",", '"', "ওরা", "খামারের", "মুরগি", "।", '"']),
    ("৩*৩=৬?", ["৩", "*", "৩", "=", "৬", "?"]),
    ("কাঁঠাল-এর গন্ধই অন্যরকম", ["কাঁঠাল", "-", "এর", "গন্ধই", "অন্যরকম"]),
    # Abbreviations
    ("ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।", ["ডঃ", "খালেদ", "বললেন", "ঢাকায়", "৩৫", "ডিগ্রি", "সে.", "।"]),
]
# fmt: on


@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
    tokens = bn_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
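# Note: the ``bn_tokenizer`` fixture used by the tests in this module is not
# defined here; it is expected to be provided by the test suite's shared
# ``conftest.py``. A minimal sketch of such a fixture (an illustrative
# assumption, not necessarily the suite's exact definition) would be, in
# ``conftest.py``:
#
#     import pytest
#     from spacy.util import get_lang_class
#
#     @pytest.fixture(scope="session")
#     def bn_tokenizer():
#         return get_lang_class("bn")().tokenizer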
def test_bn_tokenizer_handles_long_text(bn_tokenizer):
    text = """নর্থ সাউথ বিশ্ববিদ্যালয়ে সারাবছর কোন না কোন বিষয়ে গবেষণা চলতেই থাকে। \
অভিজ্ঞ ফ্যাকাল্টি মেম্বারগণ প্রায়ই শিক্ষার্থীদের নিয়ে বিভিন্ন গবেষণা প্রকল্পে কাজ করেন, \
যার মধ্যে রয়েছে রোবট থেকে মেশিন লার্নিং সিস্টেম ও আর্টিফিশিয়াল ইন্টেলিজেন্স। \
এসকল প্রকল্পে কাজ করার মাধ্যমে সংশ্লিষ্ট ক্ষেত্রে যথেষ্ঠ পরিমাণ স্পেশালাইজড হওয়া সম্ভব। \
আর গবেষণার কাজ তোমার ক্যারিয়ারকে ঠেলে নিয়ে যাবে অনেকখানি! \
কন্টেস্ট প্রোগ্রামার হও, গবেষক কিংবা ডেভেলপার - নর্থ সাউথ ইউনিভার্সিটিতে তোমার প্রতিভা বিকাশের সুযোগ রয়েছেই। \
নর্থ সাউথের অসাধারণ কমিউনিটিতে তোমাকে সাদর আমন্ত্রণ।"""
    tokens = bn_tokenizer(text)
    assert len(tokens) == 84
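# For quick manual inspection outside of pytest, a comparable tokenizer can be
# obtained from a blank Bengali pipeline (a sketch assuming only spaCy's public
# ``spacy.blank`` API):
#
#     import spacy
#
#     nlp = spacy.blank("bn")
#     doc = nlp("আমি বাংলায় গান গাই!")
#     print([token.text for token in doc])  # should match the first TESTCASES entry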