mirror of https://github.com/explosion/spaCy.git
Import and combine Portuguese tokenizer exceptions (see #943)
This commit is contained in:
parent
f8b2d9c3b7
commit
ad8bf1829f
|
@@ -5,13 +5,15 @@ from .. import language_data as base
|
||||||
from ..language_data import update_exc, strings_to_exc
|
from ..language_data import update_exc, strings_to_exc
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
|
||||||
|
|
||||||
STOP_WORDS = set(STOP_WORDS)
|
STOP_WORDS = set(STOP_WORDS)
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
|
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||||
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
|
||||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
|
||||||
|
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
|
||||||
|
|
Loading…
Reference in New Issue