spaCy/spacy/lang/tr/lex_attrs.py

90 lines
1.6 KiB
Python
Raw Normal View History

2018-03-08 14:25:25 +00:00
from ...attrs import LIKE_NUM
# Compound numbers such as thirteen or fifteen are written as separate words
# in Turkish (e.g. "on üç"), so only the component words are listed below.
2019-02-07 19:54:07 +00:00
# Cardinal number words, stored in lowercase. Only the building blocks are
# listed: compound numbers are written as separate words (e.g. "on üç" for
# thirteen), so each component is matched as a token of its own.
_num_words = [
    # zero and units
    "sıfır",
    "bir",
    "iki",
    "üç",
    "dört",
    "beş",
    "altı",
    "yedi",
    "sekiz",
    "dokuz",
    # tens
    "on",
    "yirmi",
    "otuz",
    "kırk",
    "elli",
    "altmış",
    "yetmiş",
    "seksen",
    "doksan",
    # hundred, thousand and larger magnitudes
    "yüz",
    "bin",
    "milyon",
    "milyar",
    "trilyon",
    "katrilyon",
    "kentilyon",
]
2018-03-08 14:25:25 +00:00
# Ordinal number words, stored in lowercase, in the same order as the
# cardinal building blocks (units, tens, then larger magnitudes).
_ordinal_words = [
    # units
    "birinci",
    "ikinci",
    "üçüncü",
    "dördüncü",
    "beşinci",
    "altıncı",
    "yedinci",
    "sekizinci",
    "dokuzuncu",
    # tens
    "onuncu",
    "yirminci",
    "otuzuncu",
    "kırkıncı",
    "ellinci",
    "altmışıncı",
    "yetmişinci",
    "sekseninci",
    "doksanıncı",
    # hundred, thousand and larger magnitudes
    "yüzüncü",
    "bininci",
    "milyonuncu",  # was misspelled "mliyonuncu", so the real word never matched
    "milyarıncı",
    "trilyonuncu",
    "katrilyonuncu",
    "kentilyonuncu",
]
# Ordinal suffixes that may follow a digit string (e.g. "3üncü"). Kept as a
# tuple because it is passed directly to ``str.endswith``.
_ordinal_endings = (
    "inci",
    "ıncı",
    "nci",
    "ncı",
    "uncu",
    "üncü",
)
2020-10-10 17:14:48 +00:00
2018-03-08 14:25:25 +00:00
def _turkish_lower(text):
    """Lowercase *text* following Turkish casing rules.

    ``str.lower`` maps "I" to "i" and "İ" to "i" followed by a combining dot
    (U+0307). Turkish pairs "I" with "ı" and "İ" with "i", so both capitals
    are translated explicitly before the generic lowercasing runs.
    """
    return text.replace("İ", "i").replace("I", "ı").lower()


def like_num(text):
    """Return True if *text* looks like a number.

    Recognised forms:

    * digit strings, optionally preceded by one of "+", "-", "±", "~" and
      containing "," or "." separators (e.g. "10.000", "-3,5");
    * simple fractions with exactly one "/" (e.g. "3/4");
    * cardinal and ordinal number words from the module tables, matched
      case-insensitively;
    * digits followed by an ordinal suffix (e.g. "3üncü").
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Thousands and decimal separators carry no information for this check.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Try the generic lowercase first so ASCII-uppercased input (e.g. "BIR")
    # keeps matching, then the Turkish-aware one so that correctly
    # capitalised words such as "İki" or "ALTI" are recognised as well.
    for text_lower in (text.lower(), _turkish_lower(text)):
        # Check cardinal number
        if text_lower in _num_words:
            return True
        # Check ordinal number
        if text_lower in _ordinal_words:
            return True
        if text_lower.endswith(_ordinal_endings):
            # Suffixes are three or four characters long; accept the token
            # when what precedes either length consists of digits only.
            if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
                return True
    return False
# Maps the LIKE_NUM attribute ID to the function that computes it.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}