Expand tokenizer exceptions with unicode apostrophe (fixes #685)

This commit is contained in:
Ines Montani 2016-12-17 12:34:08 +01:00
parent e0a7b5c612
commit 5445074cbd
1 changed file with 2 additions and 0 deletions

View File

@@ -13,6 +13,7 @@ from ..attrs import LANG
from ..language_data import update_exc from ..language_data import update_exc
from ..language_data import strings_to_exc from ..language_data import strings_to_exc
from ..language_data import expand_exc
from ..language_data import EMOTICONS from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY from .language_data import ORTH_ONLY
@@ -30,6 +31,7 @@ STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
class English(Language): class English(Language):