spaCy/spacy/lang/bn/lemmatizer.py

59 lines
1.7 KiB
Python
Raw Normal View History

2017-03-12 12:07:28 +00:00
# coding: utf8
from __future__ import unicode_literals
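# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা - অধ্যাপক নিরঞ্জন অধিকারী ও অধ্যাপক ড. সফিউদ্দিন আহমদ
# (English: "Higher Bengali Grammar and Composition" by Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmad)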
# Ending-rewrite rules for Bengali, grouped under the keys "noun" and "punct".
#
# Every rule is a two-item list ``[ending, replacement]``. An empty
# replacement means the ending is simply dropped. Rule order is preserved
# as-is because a consumer may try the rules sequentially.
# NOTE(review): presumably consumed by spaCy's rule-based lemmatizer, which
# swaps a matching word ending for the replacement - confirm against the caller.
#
# The ``ending`` must never be the empty string: ``"".endswith("")`` is true
# for every token, so an empty ending would fire on every word (for example,
# a rule ``["", "0"]`` would append "0" to everything). Single-character
# endings are therefore spelled as ``\u`` escapes so they cannot be lost
# silently when the file is copied through tools that mangle lone
# combining marks or non-ASCII characters.
LEMMA_RULES = {
    "noun": [
        ["টা", ""],
        ["টি", ""],
        ["খান", ""],
        ["খানা", ""],
        ["খানি", ""],
        ["গাছা", ""],
        ["গাছি", ""],
        ["ছড়া", ""],
        ["কে", ""],
        # NOTE(review): U+09C7 BENGALI VOWEL SIGN E (standalone form: এ);
        # restored from an empty pattern - confirm against upstream spaCy.
        ["\u09c7", ""],
        ["তে", ""],
        # NOTE(review): U+09B0 BENGALI LETTER RA (র);
        # restored from an empty pattern - confirm against upstream spaCy.
        ["\u09b0", ""],
        ["রা", ""],
        ["রে", ""],
        ["ের", ""],  # standalone form: এর
        ["েরা", ""],  # standalone form: এরা
        ["দের", ""],
        ["দেরকে", ""],
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],
        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],
        ["পাল", ""],
        ["পুঞ্জ", ""],
        ["মণ্ডলী", ""],
        ["মালা", ""],
        ["রাজি", ""],
        ["বৃন্দ", ""],
        ["বর্গ", ""],
        ["শ্রেণী", ""],
        ["শ্রেনি", ""],
        ["রাশি", ""],
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""],  # standalone form: আবলি
        # Bengali digit representations (U+09E6 BENGALI DIGIT ZERO through
        # U+09EF BENGALI DIGIT NINE), each mapped to its ASCII digit.
        ["\u09e6", "0"],  # ০
        ["\u09e7", "1"],  # ১
        ["\u09e8", "2"],  # ২
        ["\u09e9", "3"],  # ৩
        ["\u09ea", "4"],  # ৪
        ["\u09eb", "5"],  # ৫
        ["\u09ec", "6"],  # ৬
        ["\u09ed", "7"],  # ৭
        ["\u09ee", "8"],  # ৮
        ["\u09ef", "9"],  # ৯
    ],
    # Typographic (curly) quotation marks are normalised to their ASCII
    # counterparts: U+201C / U+201D -> '"' and U+2018 / U+2019 -> "'".
    "punct": [
        ["\u201c", '"'],
        ["\u201d", '"'],
        ["\u2018", "'"],
        ["\u2019", "'"],
    ],
}