From 6f8206576113d51914132ab8f40b005eae423628 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 14 Apr 2016 11:36:03 +0200
Subject: [PATCH] * Fix infixed commas in tokenizer, re Issue #326. Need to
 benchmark on empirical data, to make sure this doesn't break other cases.

---
 lang_data/en/infix.txt              | 1 +
 spacy/tests/tokenizer/test_infix.py | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/lang_data/en/infix.txt b/lang_data/en/infix.txt
index 8c83b7d4c..aa36da8e9 100644
--- a/lang_data/en/infix.txt
+++ b/lang_data/en/infix.txt
@@ -3,3 +3,4 @@
 (?<=[a-zA-Z])-(?=[a-zA-z])
 (?<=[a-zA-Z])--(?=[a-zA-z])
 (?<=[0-9])-(?=[0-9])
+(?<=[A-Za-z]),(?=[A-Za-z])
diff --git a/spacy/tests/tokenizer/test_infix.py b/spacy/tests/tokenizer/test_infix.py
index 4edc031d7..351394021 100644
--- a/spacy/tests/tokenizer/test_infix.py
+++ b/spacy/tests/tokenizer/test_infix.py
@@ -47,3 +47,10 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[8].text == u'--'
     assert tokens[9].text == u'people'
 
+
+def test_infix_comma(en_tokenizer):
+    # Re issue #326
+    tokens = en_tokenizer(u'Hello,world')
+    assert tokens[0].text == u'Hello'
+    assert tokens[1].text == u','
+    assert tokens[2].text == u'world'
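
Note (not part of the patch): a minimal sketch, using only Python's standard
library, of how the added infix pattern behaves. The zero-width
lookbehind/lookahead assertions consume only the comma itself, so a comma
flanked by letters on both sides becomes its own token, while strings like
'Hello, world' and '1,000' are left untouched. The split_on_infix helper is
hypothetical, for illustration only; it is not spaCy's tokenizer API.

    import re

    # The new infix rule from lang_data/en/infix.txt: a comma with a letter
    # on each side. The lookbehind/lookahead are zero-width, so the match
    # covers only the comma; the surrounding letters are not consumed.
    INFIX_COMMA = re.compile(r'(?<=[A-Za-z]),(?=[A-Za-z])')

    def split_on_infix(text, pattern=INFIX_COMMA):
        # Hypothetical helper: split `text` at each infix match, keeping
        # the matched separator as a token of its own.
        tokens = []
        start = 0
        for match in pattern.finditer(text):
            tokens.append(text[start:match.start()])
            tokens.append(match.group())
            start = match.end()
        tokens.append(text[start:])
        return tokens

    print(split_on_infix('Hello,world'))   # ['Hello', ',', 'world']
    print(split_on_infix('Hello, world'))  # ['Hello, world'] -- no letter after the comma
    print(split_on_infix('1,000'))         # ['1,000'] -- digits do not match the rule

The last two cases illustrate why the rule is scoped to letters on both
sides: numeric thousands separators and ordinary comma-plus-space usage fall
through to the tokenizer's normal handling, which is what the benchmarking
mentioned in the commit message is meant to confirm.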