Merge pull request #802 from Tpt/fr-tokenizer

Adds more French tokenizer exceptions
This commit is contained in:
Ines Montani 2017-02-03 10:52:20 +01:00 committed by GitHub
commit 65d6202107
1 changed file with 39 additions and 1 deletion

View File

@@ -41,6 +41,43 @@ ABBREVIATIONS = {
"déc.": [
{LEMMA: "décembre", ORTH: "déc."}
],
"av.": [
{LEMMA: "avant", ORTH: "av."}
],
"apr.": [
{LEMMA: "après", ORTH: "apr."}
],
"J.-C.": [
{LEMMA: "jésus", ORTH: "J."},
{LEMMA: "christ", ORTH: "-C."}
],
"Dr.": [
{LEMMA: "docteur", ORTH: "Dr."}
],
"M.": [
{LEMMA: "monsieur", ORTH: "M."}
],
"Mr.": [
{LEMMA: "monsieur", ORTH: "Mr."}
],
"Mme.": [
{LEMMA: "madame", ORTH: "Mme."}
],
"Mlle.": [
{LEMMA: "mademoiselle", ORTH: "Mlle."}
],
"": [
{LEMMA: "numéro", ORTH: ""}
],
"": [
{LEMMA: "degrés", ORTH: ""}
],
"St.": [
{LEMMA: "saint", ORTH: "St."}
],
"Ste.": [
{LEMMA: "sainte", ORTH: "Ste."}
]
}
@@ -52,7 +89,8 @@ INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
"prud'hommales",
"prud'homie", "prud'homies",
"prud'hommesque", "prud'hommesques",
"prud'hommesquement"]
"prud'hommesquement",
"c'est-à-dire", "quelqu'un", "rendez-vous"]
INFIXES_EXCEPTIONS = []
for elision_char in ELISION: