From 515493c67543365feddb733122f637c902c9a17e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 19 Jan 2016 13:20:14 +0100
Subject: [PATCH] * Add xfail test for Issue #225: tokenization with non-whitespace delimiters

---
Reviewer notes (free-form commentary placed after the "---" separator; not part
of the commit message or of the diff):

  * Hunk 1 replaces the commented-out test_mr (and its TODO saying that infix
    handling interferes) with a live test decorated with @pytest.mark.xfail.
    The test expects "Today is Tuesday.Mr." to yield exactly five tokens:
    'Today', 'is', 'Tuesday', '.', 'Mr.'.
  * Hunk 2 adds test_em_dash_infix for Issue #225, also marked xfail. It
    asserts that tokens 6, 7 and 8 of the sample sentence are 'Puddleton',
    '?' and the U+2014 em dash, i.e. that the dash is split off even though
    no whitespace surrounds it.
  * NOTE(review): both tests use the `pytest` name and the `en_tokenizer`
    fixture; neither is defined in these hunks -- confirm that the target file
    imports pytest and that the fixture is provided elsewhere.
  * NOTE(review): the '\u2014' escapes only denote an em dash if these string
    literals are unicode strings (Python 3, or unicode_literals in effect) --
    not visible from these hunks; confirm against the top of the file.

 spacy/tests/tokenizer/test_tokenizer.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index c900860c4..58c617240 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -116,12 +116,13 @@ def test_cnts5(en_tokenizer):
     tokens = en_tokenizer(text)
     assert len(tokens) == 11
 
-# TODO: This is currently difficult --- infix interferes here.
-#def test_mr(en_tokenizer):
-#    text = """Today is Tuesday.Mr."""
-#    tokens = en_tokenizer(text)
-#    assert len(tokens) == 5
-#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
+
+@pytest.mark.xfail
+def test_mr(en_tokenizer):
+    text = """Today is Tuesday.Mr."""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
 
 
 def test_cnts6(en_tokenizer):
@@ -148,6 +149,16 @@ def test_two_whitespace(en_tokenizer):
     tokens = en_tokenizer(orig_str)
     assert repr(tokens.text_with_ws) == repr(orig_str)
 
+
+@pytest.mark.xfail
+def test_em_dash_infix(en_tokenizer):
+    # Re Issue #225
+    tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
+                          '''you'll have to walk there.\u2014Ariel.''')
+    assert tokens[6].text == 'Puddleton'
+    assert tokens[7].text == '?'
+    assert tokens[8].text == '\u2014'
+
 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'
 #    tokens = EN.tokenize(text)