From 454c1996d0773d822fe00fbbcdeeabe4ae41a24c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 17 Oct 2015 15:49:51 +1100
Subject: [PATCH] * Add tokenizer rule to fix numeric range tokenization

---
 lang_data/en/infix.txt        | 1 +
 tests/tokenizer/test_infix.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/lang_data/en/infix.txt b/lang_data/en/infix.txt
index 37eca7350..cb9cc0a78 100644
--- a/lang_data/en/infix.txt
+++ b/lang_data/en/infix.txt
@@ -1,3 +1,4 @@
 \.\.\.
 (?<=[a-z])\.(?=[A-Z])
 (?<=[a-zA-Z])-(?=[a-zA-z])
+(?<=[0-9])-(?=[0-9])
diff --git a/tests/tokenizer/test_infix.py b/tests/tokenizer/test_infix.py
index f274e3dd7..d703682cf 100644
--- a/tests/tokenizer/test_infix.py
+++ b/tests/tokenizer/test_infix.py
@@ -7,6 +7,10 @@ def test_hyphen(en_tokenizer):
     assert len(tokens) == 3
 
 
+def test_numeric_range(en_tokenizer):
+    tokens = en_tokenizer('0.1-13.5')
+    assert len(tokens) == 3
+
 def test_period(en_tokenizer):
     tokens = en_tokenizer('best.Known')
     assert len(tokens) == 3