# coding: utf-8
# Regression test for issue 740 (spacy/tests/regression/test_issue740.py).

from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    'text',
    ["3/4/2012", "01/12/1900"]
)
def test_issue740(en_tokenizer, text):
    """Check that a slash-separated date is kept as a single token.

    Known inconsistency: dates separated by hyphens are still split.
    Preventing that is hard without causing clashes with numeric ranges.
    """
    tokenized = en_tokenizer(text)
    # The whole date string must survive tokenization as exactly one token.
    assert len(tokenized) == 1