mirror of https://github.com/explosion/spaCy.git
* Add xfail test for Issue #225: tokenization with non-whitespace delimiters
This commit is contained in:
parent
7abe653223
commit
515493c675
|
@ -116,12 +116,13 @@ def test_cnts5(en_tokenizer):
|
|||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 11
|
||||
|
||||
# TODO: This is currently difficult --- infix interferes here.
|
||||
#def test_mr(en_tokenizer):
|
||||
# text = """Today is Tuesday.Mr."""
|
||||
# tokens = en_tokenizer(text)
|
||||
# assert len(tokens) == 5
|
||||
# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
|
||||
|
||||
@pytest.mark.xfail
def test_mr(en_tokenizer):
    """Check that 'Tuesday.Mr.' is split into 'Tuesday', '.' and 'Mr.'.

    The sentence-final period is glued directly to the following
    abbreviation, with no whitespace in between. The test is marked xfail,
    i.e. it is currently expected to fail.
    """
    expected_words = ['Today', 'is', 'Tuesday', '.', 'Mr.']
    doc = en_tokenizer("""Today is Tuesday.Mr.""")
    # Check the count first so a wrong split is reported as a length mismatch.
    assert len(doc) == 5
    assert [token.orth_ for token in doc] == expected_words
|
||||
|
||||
|
||||
def test_cnts6(en_tokenizer):
|
||||
|
@ -148,6 +149,16 @@ def test_two_whitespace(en_tokenizer):
|
|||
tokens = en_tokenizer(orig_str)
|
||||
assert repr(tokens.text_with_ws) == repr(orig_str)
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_em_dash_infix(en_tokenizer):
    """Check that an em dash with no surrounding whitespace is its own token.

    Re Issue #225. The test is marked xfail, i.e. it is currently expected
    to fail.
    """
    text = ("Will this road take me to Puddleton?\u2014No, "
            "you'll have to walk there.\u2014Ariel.")
    doc = en_tokenizer(text)
    # (token index, expected text), checked in order so the first mismatch
    # is the one reported.
    expectations = (
        (6, 'Puddleton'),
        (7, '?'),
        (8, '\u2014'),
    )
    for index, expected_text in expectations:
        assert doc[index].text == expected_text
|
||||
|
||||
#def test_cnts7():
|
||||
# text = 'But then the 6,000-year ice age came...'
|
||||
# tokens = EN.tokenize(text)
|
||||
|
|
Loading…
Reference in New Issue