diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py
index ba5b86d77..b3150fa2f 100644
--- a/spacy/lang/th/__init__.py
+++ b/spacy/lang/th/__init__.py
@@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from ..norm_exceptions import BASE_NORMS
 from ...attrs import LANG, NORM
 from ...language import Language
@@ -34,6 +35,7 @@ class ThaiTokenizer(DummyTokenizer):
 
 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda _text: "th"
     lex_attr_getters[NORM] = add_lookups(
         Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py
new file mode 100644
index 000000000..047d046c2
--- /dev/null
+++ b/spacy/lang/th/lex_attrs.py
@@ -0,0 +1,67 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+# Thai number words matched by LIKE_NUM: digits 0-9, decades with their "+1" ("เอ็ด") forms, and powers of ten up to compounds of "ล้าน" (million).
+_num_words = [
+    "ศูนย์",
+    "หนึ่ง",
+    "สอง",
+    "สาม",
+    "สี่",
+    "ห้า",
+    "หก",
+    "เจ็ด",
+    "แปด",
+    "เก้า",
+    "สิบ",
+    "สิบเอ็ด",
+    "ยี่สิบ",
+    "ยี่สิบเอ็ด",
+    "สามสิบ",
+    "สามสิบเอ็ด",
+    "สี่สิบ",
+    "สี่สิบเอ็ด",
+    "ห้าสิบ",
+    "ห้าสิบเอ็ด",
+    "หกสิบ",
+    "หกสิบเอ็ด",
+    "เจ็ดสิบ",
+    "เจ็ดสิบเอ็ด",
+    "แปดสิบ",
+    "แปดสิบเอ็ด",
+    "เก้าสิบ",
+    "เก้าสิบเอ็ด",
+    "ร้อย",
+    "พัน",
+    "หมื่น",
+    "แสน",
+    "ล้าน",
+    "พันล้าน",
+    "หมื่นล้าน",
+    "แสนล้าน",
+    "ล้านล้าน",
+    "ล้านล้านล้าน",
+    "ล้านล้านล้านล้าน",
+]
+
+
+def like_num(text):
+    """Return True if *text* looks like a number: digits (with sign, "," or "." separators), a simple fraction, or a Thai number word."""
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}