From 41a4766c1c4b86f07a8cd2a038b2848329b1ae5e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 2 Nov 2016 20:43:12 +0100
Subject: [PATCH] Fix infixes in spanish and portuguese

---
Notes (review):
    Replaces the empty TOKENIZER_INFIXES tuple in the Spanish and Portuguese
    language data with six whitespace-separated infix patterns: runs of three
    or more dots, a period between a lowercase and an uppercase letter, a
    single or double hyphen between letters, a hyphen between digits, and a
    comma between letters.

    The lookaheads of the two letter-hyphen rules originally read [a-zA-z].
    The range A-z spans ASCII 0x41-0x7A and therefore also matches
    [ \ ] ^ _ and the backtick, so "a-_b" was split at the hyphen. They now
    read [a-zA-Z], matching the lookbehinds of the same rules.

    The line structure of this patch was restored from the hunk header
    counts (-114,7 +114,10). The post-image blob hash on the "index" lines
    is carried over unchanged and no longer describes the patched files.

 spacy/es/language_data.py | 5 ++++-
 spacy/pt/language_data.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py
index f43bc3681..417cd9828 100644
--- a/spacy/es/language_data.py
+++ b/spacy/es/language_data.py
@@ -114,7 +114,10 @@ _
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = tuple()
+TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
+                     r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+
 
 
 TOKENIZER_EXCEPTIONS = {
diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py
index f43bc3681..417cd9828 100644
--- a/spacy/pt/language_data.py
+++ b/spacy/pt/language_data.py
@@ -114,7 +114,10 @@ _
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = tuple()
+TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
+                     r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+
 
 
 TOKENIZER_EXCEPTIONS = {