mirror of https://github.com/explosion/spaCy.git
Add Thai norm_exceptions (#3612)
* Test spaCy commit to git, Fri 04/05/2019 10:54
* Change data format from my format to the master format
* Fix ทัทั้งนี้ ---> ทั้งนี้
* Delete stop_word translations from English
* Adjust formatting and readability
* Add Thai norm_exceptions
* Add Dobita21 SCA
* Edit รึ: หรือ
* Update Dobita21.md
* Auto-format
* Integrate norms into language defaults
This commit is contained in:
parent 7937109ed9
commit 189c90743c
spacy/lang/th/__init__.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
+from .norm_exceptions import NORM_EXCEPTIONS
 
-from ...attrs import LANG
+from ..norm_exceptions import BASE_NORMS
+from ...attrs import LANG, NORM
 from ...language import Language
 from ...tokens import Doc
-from ...util import DummyTokenizer
+from ...util import DummyTokenizer, add_lookups
 
 
 class ThaiTokenizer(DummyTokenizer):
@@ -33,7 +35,9 @@ class ThaiTokenizer(DummyTokenizer):
 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda _text: "th"
-
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
+    )
     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
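The add_lookups call wired into ThaiDefaults above chains the default NORM getter with the shared BASE_NORMS table and the new Thai NORM_EXCEPTIONS. The snippet below is a minimal sketch of that chained-lookup behaviour, not spaCy's actual spacy.util.add_lookups implementation; the helper name chained_lookup_getter and the tiny tables are made up for illustration, and it assumes the tables are consulted in the order they are passed with the default getter as the final fallback.

def chained_lookup_getter(default_getter, *lookup_tables):
    """Hypothetical stand-in for spacy.util.add_lookups: check each table in
    the order it was passed, falling back to the default getter on a miss."""
    def get_value(string):
        for table in lookup_tables:
            if string in table:
                return table[string]
        return default_getter(string)
    return get_value


# Illustrative tables only; the real ones are BASE_NORMS and NORM_EXCEPTIONS.
base_norms = {"’": "'"}
norm_exceptions = {"ชิมิ": "ใช่ไหม"}
get_norm = chained_lookup_getter(lambda s: s.lower(), base_norms, norm_exceptions)

print(get_norm("ชิมิ"))   # -> ใช่ไหม (found in the exception table)
print(get_norm("HELLO"))  # -> hello (no table hit, default lowercasing applies)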
spacy/lang/th/norm_exceptions.py (new file)
@@ -0,0 +1,86 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+_exc = {
+    # Tone marks that do not match the consonant class and pronunciation (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์)
+    "สนุ๊กเกอร์": "สนุกเกอร์",
+    "โน๊ต": "โน้ต",
+    # Misspellings from lazy or hurried typing (สะกดผิดเพราะขี้เกียจพิมพ์ หรือเร่งรีบ)
+    "โทสับ": "โทรศัพท์",
+    "พุ่งนี้": "พรุ่งนี้",
+    # Spellings meant to look quirky (ให้ดูแปลกตา)
+    "ชะมะ": "ใช่ไหม",
+    "ชิมิ": "ใช่ไหม",
+    "ชะ": "ใช่ไหม",
+    "ช่ายมะ": "ใช่ไหม",
+    "ป่าว": "เปล่า",
+    "ป่ะ": "เปล่า",
+    "ปล่าว": "เปล่า",
+    "คัย": "ใคร",
+    "ไค": "ใคร",
+    "คราย": "ใคร",
+    "เตง": "ตัวเอง",
+    "ตะเอง": "ตัวเอง",
+    "รึ": "หรือ",
+    "เหรอ": "หรือ",
+    "หรา": "หรือ",
+    "หรอ": "หรือ",
+    "ชั้น": "ฉัน",
+    "ชั้ล": "ฉัน",
+    "ช้าน": "ฉัน",
+    "เทอ": "เธอ",
+    "เทอร์": "เธอ",
+    "เทอว์": "เธอ",
+    "แกร": "แก",
+    "ป๋ม": "ผม",
+    "บ่องตง": "บอกตรงๆ",
+    "ถ่ามตง": "ถามตรงๆ",
+    "ต่อมตง": "ตอบตรงๆ",
+    # Misspellings used to express emotion (คำที่สะกดผิดเพื่อแสดงอารมณ์)
+    "เปงราย": "เป็นอะไร",
+    "เปนรัย": "เป็นอะไร",
+    "เปงรัย": "เป็นอะไร",
+    "เป็นอัลไล": "เป็นอะไร",
+    "ทามมาย": "ทำไม",
+    "ทามมัย": "ทำไม",
+    "จังรุย": "จังเลย",
+    "จังเยย": "จังเลย",
+    "จุงเบย": "จังเลย",
+    "มะรุ": "ไม่รู้",
+    "เฮ่ย": "เฮ้ย",
+    "เห้ย": "เฮ้ย",
+    "น่าร็อคอ่ะ": "น่ารักอ่ะ",
+    "น่าร๊ากอ้ะ": "น่ารักอ่ะ",
+    "ตั้ลล๊ากอ่ะ": "น่ารักอ่ะ",
+    # Misspellings that soften rude words or evade profanity filters (คำที่สะกดผิดเพื่อลดความหยาบของคำ หรืออาจใช้หลีกเลี่ยงการกรองคำหยาบของซอฟต์แวร์)
+    "กรู": "กู",
+    "กุ": "กู",
+    "กรุ": "กู",
+    "ตู": "กู",
+    "ตรู": "กู",
+    "มรึง": "มึง",
+    "เมิง": "มึง",
+    "มืง": "มึง",
+    "มุง": "มึง",
+    "สาด": "สัตว์",
+    "สัส": "สัตว์",
+    "สัก": "สัตว์",
+    "แสรด": "สัตว์",
+    "โคโตะ": "โคตร",
+    "โคด": "โคตร",
+    "โครต": "โคตร",
+    "โคตะระ": "โคตร",
+    # Imitative words, mostly with an added karan or repeated characters (คำเลียนเสียง โดยส่วนใหญ่จะเพิ่มทัณฑฆาต หรือซ้ำตัวอักษร)
+    "แอร๊ยย": "อ๊าย",
+    "อร๊ายยย": "อ๊าย",
+    "มันส์": "มัน",
+    "วู๊วววววววว์": "วู้",
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string] = norm
+    NORM_EXCEPTIONS[string.title()] = norm
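The closing loop mirrors the template used by other languages' norm_exceptions modules; for Thai, string.title() is effectively a no-op because the script has no letter case, so it mainly keeps the module consistent with that shared pattern. A quick, hypothetical sanity check of the table (not part of this commit) could look like the following; the commented-out pipeline check is indicative only, since building the Thai tokenizer requires the external pythainlp package.

from spacy.lang.th.norm_exceptions import NORM_EXCEPTIONS

# Direct lookups against the new table.
assert NORM_EXCEPTIONS["ชิมิ"] == "ใช่ไหม"
assert NORM_EXCEPTIONS["บ่องตง"] == "บอกตรงๆ"

# End to end, the norm is expected to surface as token.norm_ once the Thai
# defaults are loaded (requires pythainlp for tokenization):
# from spacy.lang.th import Thai
# nlp = Thai()
# doc = nlp("ชิมิ")
# assert doc[0].norm_ == "ใช่ไหม"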