# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
# Shared punctuation rules from the parent package, re-exported so this
# language module can override or extend them.
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
def strings_to_exc(orths):
    # Build a tokenizer-exception dict mapping each string to a single
    # token that keeps its exact orthography.
    return {orth: [{ORTH: orth}] for orth in orths}
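# Illustrative only (these strings are not part of the original data):
# strings_to_exc(["a.m.", "p.m."]) returns
# {"a.m.": [{ORTH: "a.m."}], "p.m.": [{ORTH: "p.m."}]},
# where ORTH is the attribute symbol imported from ..symbols.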
# spaCy assigns every pronoun form this special lemma, since a pronoun's
# base form is ambiguous.
PRON_LEMMA = "-PRON-"
# Mapping of language-specific part-of-speech tags to attribute dicts
# (universal POS plus morphological features). Empty placeholder here.
TAG_MAP = {
}
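# A populated TAG_MAP pairs tag strings with symbols from ..symbols,
# e.g. (hypothetical entries, not from the original file):
#
#     "NN": {POS: NOUN},
#     "VBZ": {POS: VERB},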
# Stop words for this language, one entry per whitespace-separated token.
# Empty placeholder here.
STOP_WORDS = set("""
""".split())
# Tokenizer exceptions: each key is a raw string, each value a list of
# dicts describing the tokens it splits into. Empty placeholder here.
TOKENIZER_EXCEPTIONS = {
}
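# A populated entry might look like this (hypothetical, using the ORTH
# and LEMMA symbols from ..symbols):
#
#     "don't": [
#         {ORTH: "do"},
#         {ORTH: "n't", LEMMA: "not"},
#     ],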
# Strings that become single tokens with only their orthography set
# (e.g. abbreviations); typically expanded via strings_to_exc() above.
ORTH_ONLY = {
}
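# Sketch of how a spaCy 1.x language module typically consumed this data
# (assumed wiring, not part of this file):
#
#     exceptions = dict(TOKENIZER_EXCEPTIONS)
#     exceptions.update(strings_to_exc(ORTH_ONLY))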