spaCy/spacy/bn/lemmatizer.py

69 lines
1.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf8
from __future__ import unicode_literals
# Source: উচ্চতর বাংলা ব্যাকরণ ও রচনা ("Higher Bengali Grammar and Composition") by Prof. Niranjan Adhikari and Prof. Dr. Safiuddin Ahmed
# Rewrite rules for Bengali, grouped by coarse part-of-speech key.
#
# Each rule is a two-element list ``[old, new]``.  Every "noun" rule built
# from letters maps a Bengali ending to the empty string, and the digit rules
# map each Bengali numeral to its ASCII counterpart.  The "punct" rules map
# curly quotation marks to their straight ASCII forms.
# NOTE(review): the rules are presumably applied as suffix replacements by the
# lemmatizer that consumes this table -- confirm against the caller.
#
# The Bengali literals must be real Bengali code points (U+0980-U+09FF).  If
# they show up as Latin letters with diacritics, the file has been decoded
# with the wrong codec and none of the rules can match Bengali text.
LEMMA_RULES = {
    "noun": [
        ["টা", ""],
        ["টি", ""],
        ["খান", ""],
        ["খানা", ""],
        ["খানি", ""],
        ["গাছা", ""],
        ["গাছি", ""],
        # "chhara": spelled with DDA + NUKTA (U+09A1 U+09BC), not the
        # precomposed RRA (U+09DC); escaped so editors cannot normalise it.
        ["\u099b\u09a1\u09bc\u09be", ""],
        ["কে", ""],
        ["ে", ""],
        ["তে", ""],
        ["র", ""],
        ["রা", ""],
        ["রে", ""],
        ["ের", ""],  # vowel-sign spelling of এর
        ["েরা", ""],  # vowel-sign spelling of এরা
        ["দের", ""],
        ["দেরকে", ""],
        ["গুলা", ""],
        ["গুলো", ""],
        ["গুলি", ""],
        ["কুল", ""],
        ["গণ", ""],
        ["দল", ""],
        ["পাল", ""],
        ["পুঞ্জ", ""],
        ["মণ্ডলী", ""],
        ["মালা", ""],
        ["রাজি", ""],
        ["বৃন্দ", ""],
        ["বর্গ", ""],
        ["শ্রেণী", ""],
        ["শ্রেনি", ""],
        ["রাশি", ""],
        ["সকল", ""],
        ["মহল", ""],
        ["াবলি", ""],  # vowel-sign spelling of আবলি
        # Bengali digits (U+09E6-U+09EF) -> ASCII digits
        ["০", "0"],
        ["১", "1"],
        ["২", "2"],
        ["৩", "3"],
        ["৪", "4"],
        ["৫", "5"],
        ["৬", "6"],
        ["৭", "7"],
        ["৮", "8"],
        ["৯", "9"],
    ],
    "punct": [
        # Curly double/single quotes -> straight ASCII quotes
        ["“", "\""],
        ["”", "\""],
        ["\u2018", "'"],
        ["\u2019", "'"],
    ],
}