diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py
index f43bc3681..417cd9828 100644
--- a/spacy/es/language_data.py
+++ b/spacy/es/language_data.py
@@ -114,7 +114,15 @@ _
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = tuple()
+# Infix patterns: the raw string is split on whitespace into one regex each.
+# They match a run of three or more dots, a period between a lowercase and
+# an uppercase letter, a hyphen or double hyphen between letters, a hyphen
+# between digits, and a comma between letters.
+# NOTE(review): the letter classes are ASCII-only; confirm that is intended.
+TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
+                     r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+
 
 
 TOKENIZER_EXCEPTIONS = {
diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py
index f43bc3681..417cd9828 100644
--- a/spacy/pt/language_data.py
+++ b/spacy/pt/language_data.py
@@ -114,7 +114,15 @@ _
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = tuple()
+# Infix patterns: the raw string is split on whitespace into one regex each.
+# They match a run of three or more dots, a period between a lowercase and
+# an uppercase letter, a hyphen or double hyphen between letters, a hyphen
+# between digits, and a comma between letters.
+# NOTE(review): the letter classes are ASCII-only; confirm that is intended.
+TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
+                     r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+
 
 
 TOKENIZER_EXCEPTIONS = {