From f95ecedd83ce75b7062af6afaf47f9ed6fe59550 Mon Sep 17 00:00:00 2001 From: Dobita21 <39238314+Dobita21@users.noreply.github.com> Date: Wed, 1 May 2019 17:03:14 +0700 Subject: [PATCH] Add Thai lex_attrs (#3655) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test sPacy commit to git fri 04052019 10:54 * change Data format from my format to master format * ทัทั้งนี้ ---> ทั้งนี้ * delete stop_word translate from Eng * Adjust formatting and readability * add Thai norm_exception * Add Dobita21 SCA * editรึ : หรือ, * Update Dobita21.md * Auto-format * Integrate norms into language defaults * add acronym and some norm exception words * add lex_attrs * Add lexical attribute getters into the language defaults * fix LEX_ATTRS Co-authored-by: Donut Co-authored-by: Ines Montani --- spacy/lang/th/__init__.py | 2 ++ spacy/lang/th/lex_attrs.py | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 spacy/lang/th/lex_attrs.py diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index ba5b86d77..b3150fa2f 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .norm_exceptions import NORM_EXCEPTIONS +from .lex_attrs import LEX_ATTRS from ..norm_exceptions import BASE_NORMS from ...attrs import LANG, NORM @@ -34,6 +35,7 @@ class ThaiTokenizer(DummyTokenizer): class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda _text: "th" lex_attr_getters[NORM] = add_lookups( Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py new file mode 100644 index 000000000..047d046c2 --- /dev/null +++ b/spacy/lang/th/lex_attrs.py @@ -0,0 +1,62 @@ +# coding: 
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


# Thai number words recognised by like_num(). Lookup is an exact match on the
# whole token text, so a numeral is only detected if it is spelled out here.
_num_words = [
    "ศูนย์",
    "หนึ่ง",
    "สอง",
    "สาม",
    "สี่",
    "ห้า",
    "หก",
    "เจ็ด",
    "แปด",
    "เก้า",
    "สิบ",
    "สิบเอ็ด",
    "ยี่สิบ",
    "ยี่สิบเอ็ด",
    "สามสิบ",
    "สามสิบเอ็ด",
    "สี่สิบ",
    "สี่สิบเอ็ด",
    "ห้าสิบ",
    "ห้าสิบเอ็ด",
    # "หกสิบ" (sixty) was the only multiple of ten absent from the table.
    "หกสิบ",
    "หกสิบเอ็ด",
    "เจ็ดสิบ",
    "เจ็ดสิบเอ็ด",
    "แปดสิบ",
    "แปดสิบเอ็ด",
    "เก้าสิบ",
    "เก้าสิบเอ็ด",
    "ร้อย",
    "พัน",
    # "หมื่น" (ten thousand) and "แสน" (hundred thousand) previously appeared
    # only in their compound "...ล้าน" forms further down.
    "หมื่น",
    "แสน",
    "ล้าน",
    "พันล้าน",
    "หมื่นล้าน",
    "แสนล้าน",
    "ล้านล้าน",
    "ล้านล้านล้าน",
    "ล้านล้านล้านล้าน",
]


def like_num(text):
    """Return True if ``text`` looks like a number.

    A token counts as number-like when, after dropping one leading sign
    character ("+", "-", "±" or "~") and every "," and ".", it is either

    * made up only of digit characters (``str.isdigit``),
    * a simple fraction ``<digits>/<digits>`` with exactly one "/", or
    * one of the Thai number words in ``_num_words`` (exact match).

    text (unicode): The token text to test.
    RETURNS (bool): Whether the text resembles a number.
    """
    # Only a single leading sign character is stripped.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Remove thousands / decimal separators so "1,000.50" is seen as digits.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


# Lexical attribute getters exported for the Thai language defaults.
LEX_ATTRS = {LIKE_NUM: like_num}