From 19c1e83d3de979495fe12d3034fe4853a181039c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 2 Nov 2016 19:56:32 +0100
Subject: [PATCH] Work on draft Italian tokenizer

---
 spacy/it/__init__.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py
index 6a824fe82..b5e5487dd 100644
--- a/spacy/it/__init__.py
+++ b/spacy/it/__init__.py
@@ -3,7 +3,25 @@ from __future__ import unicode_literals, print_function
 from os import path
 
 from ..language import Language
+from ..attrs import LANG
+from . import language_data
 
 
-class Italian(Language):
-    pass
+class Italian(Language):
+    lang = 'it'
+
+    class Defaults(Language.Defaults):
+        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'it'
+
+        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
+
+        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
+
+        infixes = tuple(language_data.TOKENIZER_INFIXES)
+
+        tag_map = dict(language_data.TAG_MAP)
+
+        stop_words = set(language_data.STOP_WORDS)
+