From 417d45f5d062078e1895f4521e868c5bece91a54 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 11 Oct 2017 02:24:58 +0200
Subject: [PATCH] Add lemmatizer data as variable on language data

Don't create lookup lemmatizer within Language class and just pass in
the data so it can be set on Token creation

---
 spacy/lang/de/__init__.py | 6 +-----
 spacy/lang/en/__init__.py | 3 ++-
 spacy/lang/es/__init__.py | 6 +-----
 spacy/lang/fr/__init__.py | 6 +-----
 spacy/lang/hu/__init__.py | 6 +-----
 spacy/lang/id/__init__.py | 6 +-----
 spacy/lang/it/__init__.py | 6 +-----
 spacy/lang/pt/__init__.py | 6 +-----
 spacy/lang/sv/__init__.py | 7 ++-----
 9 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index 0ff707a06..e56bab844 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -27,10 +26,7 @@ class GermanDefaults(Language.Defaults):
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class German(Language):
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index 79d383b90..fffac6467 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
-from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@@ -30,6 +30,7 @@ class EnglishDefaults(Language.Defaults):
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)
+    lemma_lookup = dict(LOOKUP)
     syntax_iterators = dict(SYNTAX_ITERATORS)
 
 
diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index e64b88fad..4246a0703 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -23,10 +22,7 @@ class SpanishDefaults(Language.Defaults):
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     sytax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class Spanish(Language):
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index e2123c28f..0f2a60e3e 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -27,10 +26,7 @@ class FrenchDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     token_match = TOKEN_MATCH
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class French(Language):
diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py
index 9b6b63a81..fd039a8eb 100644
--- a/spacy/lang/hu/__init__.py
+++ b/spacy/lang/hu/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -24,10 +23,7 @@ class HungarianDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
     token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class Hungarian(Language):
diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py
index b4d020427..29fe86a01 100644
--- a/spacy/lang/id/__init__.py
+++ b/spacy/lang/id/__init__.py
@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG
 from ...util import update_exc
 
@@ -26,10 +25,7 @@ class IndonesianDefaults(Language.Defaults):
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
     syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class Indonesian(Language):
diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index f6506038c..c19cb6d39 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -18,10 +17,7 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class Italian(Language):
diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py
index 0baae7e7a..6366a25c1 100644
--- a/spacy/lang/pt/__init__.py
+++ b/spacy/lang/pt/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -21,10 +20,7 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = dict(LOOKUP)
 
 
 class Portuguese(Language):
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index b21333fac..27da9024e 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups
 
@@ -20,10 +19,8 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_rules = dict(LEMMA_RULES)
+    lemma_lookup = dict(LOOKUP)
 
 
 class Swedish(Language):
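
Note: a minimal standalone sketch of the pattern this patch introduces, assuming
a toy LOOKUP table and a hypothetical lookup_lemma helper (neither is spaCy's
actual API). The point of the change is that the lookup table becomes plain data
on the language defaults, so any consumer, such as the code that creates tokens,
can resolve lemmas from it without a Lemmatizer object being constructed inside
the Language class:

    # Toy sketch, not spaCy's actual API: the lookup table is plain data on
    # the defaults, mirroring the `lemma_lookup = dict(LOOKUP)` lines above.
    LOOKUP = {"feet": "foot", "ran": "run"}  # hypothetical lookup table

    class Defaults(object):
        lemma_lookup = dict(LOOKUP)  # data attribute, no Lemmatizer instance

    def lookup_lemma(string, table=Defaults.lemma_lookup):
        # Hypothetical helper: fall back to the surface form when there is
        # no entry, which is how a lookup lemmatizer treats unknown words.
        return table.get(string, string)

    assert lookup_lemma("feet") == "foot"
    assert lookup_lemma("dogs") == "dogs"  # no entry, returned unchanged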