spaCy/spacy/tests/tokenizer/test_contractions.py

74 lines
2.5 KiB
Python
Raw Normal View History

from __future__ import unicode_literals
from ...en import English
2014-12-21 09:41:13 +00:00
import pytest
@pytest.fixture
def en_tokenizer():
return English.Defaults.create_tokenizer()
@pytest.mark.parametrize('inputs', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, inputs):
text_poss, text = inputs
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize('inputs', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, inputs):
text_lower, text_title = inputs
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
assert tokens_title[0].text == tokens_lower[0].text.title()
assert tokens_lower[0].text == tokens_title[0].text.lower()
assert tokens_lower[1].text == tokens_title[1].text
2014-12-07 11:08:04 +00:00
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron):
for contraction in ["'ll", "'d"]:
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
2014-12-07 11:08:04 +00:00
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize('inputs', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, inputs):
wo_punct, w_punct = inputs
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
tokens = en_tokenizer(w_punct)
assert len(tokens) == 3