From 54fcc5bba2e7eacfc7882082dbc5770e8968e018 Mon Sep 17 00:00:00 2001 From: thjbbvlt Date: Tue, 26 Mar 2024 11:25:43 +0100 Subject: [PATCH] add dot after some abbrevs --- spacy/lang/fr/tokenizer_exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 1ae0075f4..0c39fdfc7 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,15 +4,19 @@ from ..tokenizer_exceptions import BASE_EXCEPTIONS _exc = { "St": [{ORTH: "St", NORM: "Saint"}], + "St.": [{ORTH: "St.", NORM: "Saint"}], "Ste": [{ORTH: "Ste", NORM: "Sainte"}], "Mme": [{ORTH: "Mme", NORM: "Madame"}], "Mr": [{ORTH: "Mr", NORM: "Monsieur"}], + "Mr.": [{ORTH: "Mr.", NORM: "Monsieur"}], "M.": [{ORTH: "M.", NORM: "Monsieur"}], "Mlle": [{ORTH: "Mlle", NORM: "Mademoiselle"}], "Dr": [{ORTH: "Dr", NORM: "Docteur"}], + "Dr.": [{ORTH: "Dr.", NORM: "Docteur"}], "Dresse": [{ORTH: "Dresse", NORM: "Doctoresse"}], "Drsse": [{ORTH: "Drsse", NORM: "Doctoresse"}], "etc": [{ORTH: "etc", NORM: "etcaetera"}], + "etc.": [{ORTH: "etc.", NORM: "etcaetera"}], # months "jan.": [{ORTH: "jan.", NORM: "janvier"}], "janv.": [{ORTH: "janv.", NORM: "janvier"}],