From 737816e86eda7d880baee61390c853284f1487b4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 4 Nov 2016 15:16:20 +0100 Subject: [PATCH] Fix #368: Tokenizer handled pattern 'unicode close quote, period' incorrectly. --- spacy/en/language_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index 504103566..4666c0c6f 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -102,8 +102,8 @@ TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... . TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ' ” '' 's 'S ’s ’S ’''' - '''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]"'%\)])\. ''' - '''(?<=[0-9])km''').strip().split() + r'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]”"'%\)])\. ''' + r'''(?<=[0-9])km''').strip().split() TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''