* Add xfail test for Issue #225: tokenization with non-whitespace delimiters

This commit is contained in:
Matthew Honnibal 2016-01-19 13:20:14 +01:00
parent 7abe653223
commit 515493c675
1 changed file with 17 additions and 6 deletions

View File

@ -116,12 +116,13 @@ def test_cnts5(en_tokenizer):
tokens = en_tokenizer(text)
assert len(tokens) == 11
# TODO: This is currently difficult --- infix interferes here.
#def test_mr(en_tokenizer):
# text = """Today is Tuesday.Mr."""
# tokens = en_tokenizer(text)
# assert len(tokens) == 5
# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
@pytest.mark.xfail
def test_mr(en_tokenizer):
    """Check tokenization of 'Mr.' glued directly onto a sentence-final period.

    The input has no whitespace between 'Tuesday.' and 'Mr.'. The expected
    result is five tokens: the period is split off 'Tuesday' while 'Mr.'
    stays a single token. Marked xfail, i.e. currently expected to fail.
    """
    expected = ['Today', 'is', 'Tuesday', '.', 'Mr.']
    doc = en_tokenizer("""Today is Tuesday.Mr.""")
    # Check the count first so a length mismatch is reported on its own line.
    assert len(doc) == len(expected)
    assert [token.orth_ for token in doc] == expected
def test_cnts6(en_tokenizer):
@ -148,6 +149,16 @@ def test_two_whitespace(en_tokenizer):
tokens = en_tokenizer(orig_str)
assert repr(tokens.text_with_ws) == repr(orig_str)
@pytest.mark.xfail
def test_em_dash_infix(en_tokenizer):
    """Check that an em dash with no surrounding whitespace is its own token.

    Re Issue #225. In the input, U+2014 directly follows 'Puddleton?' and is
    directly followed by 'No'. The test expects 'Puddleton', '?' and the em
    dash at token indices 6, 7 and 8. Marked xfail, i.e. currently expected
    to fail.
    """
    # NOTE(review): the \u2014 escapes only denote an em dash if these are
    # unicode string literals (Python 3, or unicode_literals) -- confirm.
    text = ('''Will this road take me to Puddleton?\u2014No, '''
            '''you'll have to walk there.\u2014Ariel.''')
    doc = en_tokenizer(text)
    # Expected token texts, starting at index 6 and checked in order.
    for index, expected in enumerate(['Puddleton', '?', '\u2014'], 6):
        assert doc[index].text == expected
#def test_cnts7():
# text = 'But then the 6,000-year ice age came...'
# tokens = EN.tokenize(text)