Wire up English lemma and morph rules.

Matthew Honnibal 2017-03-15 09:23:22 -05:00
parent f70be44746
commit 8dbff4f5f4
3 changed files with 7 additions and 2 deletions

@@ -31,6 +31,7 @@ class English(Language):
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    morph_rules = dict(MORPH_RULES)
     lemma_rules = dict(LEMMA_RULES)
     lemma_index = dict(LEMMA_INDEX)
     lemma_exc = dict(LEMMA_EXC)
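
For context, the three lemma tables wired into the class above are plain dicts that drive rule-based lemmatization: the index lists known base forms, the exception table maps irregular forms directly, and the rules rewrite suffixes. The sketch below is a toy stand-in for that lookup logic, with hypothetical miniature tables; it is not spaCy's actual Lemmatizer code.

# Toy sketch of how LEMMA_INDEX / LEMMA_EXC / LEMMA_RULES interact.
def lemmatize(string, index, exceptions, rules):
    # Irregular forms short-circuit through the exception table.
    if string in exceptions:
        return exceptions[string][0]
    # Otherwise try each suffix rule and keep a candidate that is a known lemma.
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                return form
    return string

# Hypothetical miniature tables in the same shape as the real ones:
noun_index = {"dog", "wolf", "ox"}
noun_exc = {"oxen": ["ox"]}
noun_rules = [["ves", "f"], ["s", ""]]

assert lemmatize("wolves", noun_index, noun_exc, noun_rules) == "wolf"
assert lemmatize("oxen", noun_index, noun_exc, noun_rules) == "ox"
assert lemmatize("dogs", noun_index, noun_exc, noun_rules) == "dog"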

@@ -9,6 +9,9 @@ from .tag_map import TAG_MAP
 from .word_sets import STOP_WORDS, NUM_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 from .morph_rules import MORPH_RULES
+from .lemmatizer import RULES as LEMMA_RULES
+from .lemmatizer import INDEX as LEMMA_INDEX
+from .lemmatizer import EXC as LEMMA_EXC
 TAG_MAP = dict(TAG_MAP)
@@ -22,4 +25,5 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
-__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES"]
+__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "MORPH_RULES",
+           "LEMMA_RULES", "LEMMA_INDEX", "LEMMA_EXC"]

@@ -770,5 +770,5 @@ ORTH_ONLY = [
     "Rev.",
     "Sen.",
     "St.",
-    "vs."
+    "vs.",
 ]