diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py
index 328c39804..be5b3b0f0 100644
--- a/spacy/de/__init__.py
+++ b/spacy/de/__init__.py
@@ -3,22 +3,22 @@ from __future__ import unicode_literals, print_function
 from os import path
 
 from ..language import Language
-from ..vocab import Vocab
-from ..attrs import LANG
+from . import language_data
 
 
 class German(Language):
     lang = 'de'
 
     class Defaults(Language.Defaults):
-        def Vocab(self, vectors=None, lex_attr_getters=None):
-            if lex_attr_getters is None:
-                lex_attr_getters = dict(self.lex_attr_getters)
-            if vectors is None:
-                vectors = self.Vectors()
-            # set a dummy lemmatizer for now that simply returns the same string
-            # until the morphology is done for German
-            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters, vectors=vectors,
-                              lemmatizer=False)
+        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
+
+        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
+
+        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
+
+        infixes = tuple(language_data.TOKENIZER_INFIXES)
+
+        tag_map = dict(language_data.TAG_MAP)
+
+        stop_words = set(language_data.STOP_WORDS)
 
-        stop_words = set()
diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index ec8d1144f..e05787f12 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -14,35 +14,6 @@ class English(Language):
     lang = 'en'
 
     class Defaults(Language.Defaults):
-        def Vocab(self, lex_attr_getters=True, tag_map=True,
-                  lemmatizer=True, serializer_freqs=True, vectors=True):
-            if lex_attr_getters is True:
-                lex_attr_getters = self.lex_attr_getters
-            if tag_map is True:
-                tag_map = self.tag_map
-            if lemmatizer is True:
-                lemmatizer = self.Lemmatizer()
-            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
-                              tag_map=tag_map, lemmatizer=lemmatizer,
-                              serializer_freqs=serializer_freqs)
-
-        def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None,
-                      infix_finditer=None):
-            if rules is None:
-                rules = self.tokenizer_exceptions
-            if prefix_search is None:
-                prefix_search = util.compile_prefix_regex(self.prefixes).search
-            if suffix_search is None:
-                suffix_search = util.compile_suffix_regex(self.suffixes).search
-            if infix_finditer is None:
-                infix_finditer = util.compile_infix_regex(self.infixes).finditer
-            return Tokenizer(vocab, rules=rules,
-                             prefix_search=prefix_search, suffix_search=suffix_search,
-                             infix_finditer=infix_finditer)
-
-        def Lemmatizer(self):
-            return Lemmatizer.load(self.path)
-
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
 
         tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
diff --git a/spacy/language.py b/spacy/language.py
index 8a6e94098..a0400e359 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -26,6 +26,7 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 from . import util
+from .lemmatizer import Lemmatizer
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 
 
@@ -42,18 +43,39 @@ class BaseDefaults(object):
         self.lex_attr_getters[LANG] = lambda string: lang
         self.lex_attr_getters[IS_STOP] = lambda string: string in self.stop_words
 
+    def Lemmatizer(self):
+        return Lemmatizer.load(self.path)
+
     def Vectors(self):
         return True
-
-    def Vocab(self, vectors=None, lex_attr_getters=None):
-        if lex_attr_getters is None:
-            lex_attr_getters = dict(self.lex_attr_getters)
-        if vectors is None:
-            vectors = self.Vectors()
-        return Vocab.load(self.path, lex_attr_getters=self.lex_attr_getters, vectors=vectors)
 
-    def Tokenizer(self, vocab):
-        return Tokenizer.load(self.path, vocab)
+    def Vocab(self, lex_attr_getters=True, tag_map=True,
+              lemmatizer=True, serializer_freqs=True, vectors=True):
+        if lex_attr_getters is True:
+            lex_attr_getters = self.lex_attr_getters
+        if tag_map is True:
+            tag_map = self.tag_map
+        if lemmatizer is True:
+            lemmatizer = self.Lemmatizer()
+        if vectors is True:
+            vectors = self.Vectors()
+        return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
+                          tag_map=tag_map, lemmatizer=lemmatizer,
+                          serializer_freqs=serializer_freqs)
+
+    def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None,
+                  infix_finditer=None):
+        if rules is None:
+            rules = self.tokenizer_exceptions
+        if prefix_search is None:
+            prefix_search = util.compile_prefix_regex(self.prefixes).search
+        if suffix_search is None:
+            suffix_search = util.compile_suffix_regex(self.suffixes).search
+        if infix_finditer is None:
+            infix_finditer = util.compile_infix_regex(self.infixes).finditer
+        return Tokenizer(vocab, rules=rules,
+                         prefix_search=prefix_search, suffix_search=suffix_search,
+                         infix_finditer=infix_finditer)
 
     def Tagger(self, vocab):
         return Tagger.load(self.path / 'pos', vocab)
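Usage note (illustration, not part of the patch): a minimal sketch of how the factory methods consolidated onto the shared BaseDefaults might be exercised after this change. The Defaults constructor arguments and the data path are assumptions inferred from context, not confirmed by this diff.

    from pathlib import Path
    from spacy.en import English

    data_path = Path('/usr/local/share/spacy/en')           # hypothetical model directory
    defaults = English.Defaults(English.lang, data_path)    # constructor signature assumed, not shown in this diff
    vocab = defaults.Vocab()                # True-valued kwargs fall back to the class-level defaults
    tokenizer = defaults.Tokenizer(vocab)   # prefix/suffix/infix regexes compiled from the class attributes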