From 8f24dc198262fe57f5eb435fe4b8cb425d38b5da Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 2 Nov 2016 20:43:52 +0100
Subject: [PATCH] Fix infixes in Italian

---
Review notes (this section is ignored when the patch is applied):

* TOKENIZER_INFIXES is produced by whitespace-splitting three adjacent raw
  string literals, which yields six regular expressions:
    1. a run of three or more dots;
    2. a dot between a lowercase and an uppercase ASCII letter;
    3. a single hyphen between two ASCII letters;
    4. a double hyphen between two ASCII letters;
    5. a hyphen between two digits;
    6. a comma between two ASCII letters.
* The lookaheads of patterns 3 and 4 previously read [a-zA-z]. The range
  "A-z" spans ASCII 65-122 and therefore also accepts the punctuation
  characters [ \ ] ^ _ and the backtick, so a hyphen followed by one of
  those was split as an infix. Both lookaheads now read [a-zA-Z], which
  agrees with the lookbehinds of the same patterns.
* The post-image blob id on the "index" line below predates this correction.

 spacy/it/language_data.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py
index f43bc3681..417cd9828 100644
--- a/spacy/it/language_data.py
+++ b/spacy/it/language_data.py
@@ -114,7 +114,10 @@ _
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = tuple()
+TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
+                     r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+
 
 
 TOKENIZER_EXCEPTIONS = {