* Add note about failed tokenization

This commit is contained in:
Matthew Honnibal 2015-06-08 16:17:07 +02:00
parent c7e3dfc1dc
commit bd4f5f89cb
1 changed files with 6 additions and 4 deletions

View File

@ -103,10 +103,12 @@ def test_cnts5(en_tokenizer):
tokens = en_tokenizer(text)
assert len(tokens) == 11
def test_mr(en_tokenizer):
text = """Mr. Smith"""
tokens = en_tokenizer(text)
assert len(tokens) == 2
# TODO: This is currently difficult --- infix interferes here.
#def test_mr(en_tokenizer):
# text = """Today is Tuesday.Mr."""
# tokens = en_tokenizer(text)
# assert len(tokens) == 5
# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
def test_cnts6(en_tokenizer):