Add Thai lex_attrs (#3655)

* test spaCy commit to git fri 04052019 10:54 * change Data format from my format to master format * ทัทั้งนี้ ---> ทั้งนี้ * delete stop_word translate from Eng * Adjust formatting and readability * add Thai norm_exception * Add Dobita21 SCA * edit รึ : หรือ, * Update Dobita21.md * Auto-format * Integrate norms into language defaults * add acronym and some norm exception words * add lex_attrs * Add lexical attribute getters into the language defaults * fix LEX_ATTRS Co-authored-by: Donut <dobita21@gmail.com> Co-authored-by: Ines Montani <ines@ines.io>
2019-05-01 17:03:14 +07:00 · 2019-05-01 17:03:14 +07:00 · f95ecedd83
parent ba1ff00370
commit f95ecedd83
2 changed files with 64 additions and 0 deletions
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS

 from ..norm_exceptions import BASE_NORMS
 from ...attrs import LANG, NORM
@@ -34,6 +35,7 @@ class ThaiTokenizer(DummyTokenizer):

 class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda _text: "th"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
--- a/spacy/lang/th/lex_attrs.py
+++ b/spacy/lang/th/lex_attrs.py
@@ -0,0 +1,62 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+# Thai number words that like_num() accepts as numeric tokens.
+# Matching is by exact string equality, so only the spellings listed here
+# are recognised; arbitrary compound numerals are not decomposed.
+_num_words = [
+    # 0-10
+    "ศูนย์",
+    "หนึ่ง",
+    "สอง",
+    "สาม",
+    "สี่",
+    "ห้า",
+    "หก",
+    "เจ็ด",
+    "แปด",
+    "เก้า",
+    "สิบ",
+    # 11, then each multiple of ten (20-90) followed by its "-one" form
+    "สิบเอ็ด",
+    "ยี่สิบ",
+    "ยี่สิบเอ็ด",
+    "สามสิบ",
+    "สามสิบเอ็ด",
+    "สี่สิบ",
+    "สี่สิบเอ็ด",
+    "ห้าสิบ",
+    "ห้าสิบเอ็ด",
+    "หกสิบ",  # 60: previously missing although 61 was listed
+    "หกสิบเอ็ด",
+    "เจ็ดสิบ",
+    "เจ็ดสิบเอ็ด",
+    "แปดสิบ",
+    "แปดสิบเอ็ด",
+    "เก้าสิบ",
+    "เก้าสิบเอ็ด",
+    # Magnitudes: hundred, thousand, ten thousand, hundred thousand, million
+    "ร้อย",
+    "พัน",
+    "หมื่น",  # 10,000: base form of the already-listed "หมื่นล้าน"
+    "แสน",  # 100,000: base form of the already-listed "แสนล้าน"
+    "ล้าน",
+    # Compounds built on "ล้าน" (million)
+    "พันล้าน",
+    "หมื่นล้าน",
+    "แสนล้าน",
+    "ล้านล้าน",
+    "ล้านล้านล้าน",
+    "ล้านล้านล้านล้าน",
+]
+
+
+def like_num(text):
+    """Return True if ``text`` looks like a number, otherwise False.
+
+    A single leading "+", "-", "±" or "~" is ignored, and every "," and "."
+    is removed before checking. The remainder counts as number-like when it
+    consists only of digits, is a ``digits/digits`` fraction with exactly
+    one slash, or is one of the Thai number words in ``_num_words``.
+    """
+    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
+    candidate = unsigned.replace(",", "").replace(".", "")
+    if candidate.isdigit():
+        return True
+    # Exactly one "/" splits into exactly two parts: numerator, denominator.
+    parts = candidate.split("/")
+    if len(parts) == 2 and all(part.isdigit() for part in parts):
+        return True
+    return candidate in _num_words
+
+
+# Lexical attribute getters exported by this module, keyed by attribute ID.
+LEX_ATTRS = {}
+LEX_ATTRS[LIKE_NUM] = like_num