Fix tokenizer_exceptions in Thai

This commit is contained in:
Wannaphong Phatthiyaphaibun 2017-09-26 22:14:47 +07:00
parent a2bf4cc7bf
commit 2ea27d07f4
1 changed file with 1 addition and 38 deletions

View File

@ -1,9 +1,7 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..symbols import * from ...symbols import *
from ..language_data import PRON_LEMMA
TOKENIZER_EXCEPTIONS = { TOKENIZER_EXCEPTIONS = {
"ม.ค.": [ "ม.ค.": [
@ -43,38 +41,3 @@ TOKENIZER_EXCEPTIONS = {
{ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"}
] ]
} }
# exceptions mapped to a single token containing only ORTH property
# example: {"string": [{ORTH: "string"}]}
# converted using strings_to_exc() util
'''
ORTH_ONLY = [
"a.",
"b.",
"c.",
"d.",
"e.",
"f.",
"g.",
"h.",
"i.",
"j.",
"k.",
"l.",
"m.",
"n.",
"o.",
"p.",
"q.",
"r.",
"s.",
"t.",
"u.",
"v.",
"w.",
"x.",
"y.",
"z."
]
'''