From 515493c67543365feddb733122f637c902c9a17e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Tue, 19 Jan 2016 13:20:14 +0100
Subject: [PATCH] * Add xfail test for Issue #225: tokenization with
 non-whitespace delimiters

---
 spacy/tests/tokenizer/test_tokenizer.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index c900860c4..58c617240 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -116,12 +116,13 @@ def test_cnts5(en_tokenizer):
     tokens = en_tokenizer(text)
     assert len(tokens) == 11
 
-# TODO: This is currently difficult --- infix interferes here.
-#def test_mr(en_tokenizer):
-#    text = """Today is Tuesday.Mr."""
-#    tokens = en_tokenizer(text)
-#    assert len(tokens) == 5
-#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
+
+@pytest.mark.xfail  # currently difficult: infix handling interferes with "Tuesday.Mr."
+def test_mr(en_tokenizer):
+    text = """Today is Tuesday.Mr."""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
 
 
 def test_cnts6(en_tokenizer):
@@ -148,6 +149,16 @@ def test_two_whitespace(en_tokenizer):
     tokens = en_tokenizer(orig_str)
     assert repr(tokens.text_with_ws) == repr(orig_str)
 
+
+@pytest.mark.xfail
+def test_em_dash_infix(en_tokenizer):
+    # Re Issue #225: em dash (U+2014) used as a delimiter with no surrounding whitespace.
+    tokens = en_tokenizer('''Will this road take me to Puddleton?\u2014No, '''
+                          '''you'll have to walk there.\u2014Ariel.''')
+    assert tokens[6].text == 'Puddleton'
+    assert tokens[7].text == '?'
+    assert tokens[8].text == '\u2014'  # the em dash is expected to be its own token
+
 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'
 #    tokens = EN.tokenize(text)