From 5d706ab95d6104f9a06d139be67d7507e7575486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?=
Date: Thu, 9 Feb 2017 16:30:16 +0100
Subject: [PATCH] Merge tokenizer exceptions from PR #802

---
 spacy/fr/resources/tokenizer_exceptions |  1 -
 spacy/fr/tokenizer_exceptions.py        | 41 +++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/spacy/fr/resources/tokenizer_exceptions b/spacy/fr/resources/tokenizer_exceptions
index 34444c61d..4f29d8e6d 100644
--- a/spacy/fr/resources/tokenizer_exceptions
+++ b/spacy/fr/resources/tokenizer_exceptions
@@ -13039,7 +13039,6 @@ Javerlhac-et-la-Chapelle-Saint-Robert
 Javron-les-Chapelles
 JAX-RPC
 JAX-RS
-J.-C.
 Jeannois-Mitissien
 jeans-de-gand
 jeans-de-janten
diff --git a/spacy/fr/tokenizer_exceptions.py b/spacy/fr/tokenizer_exceptions.py
index c1dbe7d59..81a752755 100644
--- a/spacy/fr/tokenizer_exceptions.py
+++ b/spacy/fr/tokenizer_exceptions.py
@@ -75,15 +75,49 @@ def get_tokenizer_exceptions():
         "déc.": [
             {LEMMA: "décembre", ORTH: "déc."}
         ],
+        "apr.": [
+            {LEMMA: "après", ORTH: "apr."}
+        ],
+        "J.-C.": [
+            {LEMMA: "Jésus", ORTH: "J."},
+            {LEMMA: "Christ", ORTH: "-C."}
+        ],
+        "Dr.": [
+            {LEMMA: "docteur", ORTH: "Dr."}
+        ],
+        "M.": [
+            {LEMMA: "monsieur", ORTH: "M."}
+        ],
+        "Mr.": [
+            {LEMMA: "monsieur", ORTH: "Mr."}
+        ],
+        "Mme.": [
+            {LEMMA: "madame", ORTH: "Mme."}
+        ],
+        "Mlle.": [
+            {LEMMA: "mademoiselle", ORTH: "Mlle."}
+        ],
+        "n°": [
+            {LEMMA: "numéro", ORTH: "n°"}
+        ],
+        "d°": [
+            {LEMMA: "degrés", ORTH: "d°"}
+        ],
+        "St.": [
+            {LEMMA: "saint", ORTH: "St."}
+        ],
+        "Ste.": [
+            {LEMMA: "sainte", ORTH: "Ste."}
+        ]
     }
 
     ABBREVIATIONS_2 = [
-        "Dr.",
         "etc.",
     ]
 
     VERBS = {}
-    for verb, verb_lemma in (("a", "avoir"), ("est", "être"), ("semble", "sembler"), ("indique", "indiquer"),
+    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
+                             ("semble", "sembler"), ("indique", "indiquer"),
                              ("moque", "moquer"), ("passe", "passer")):
         for pronoun in ("elle", "il", "on"):
             token = "{}-t-{}".format(verb, pronoun)
@@ -98,7 +132,8 @@ def get_tokenizer_exceptions():
         {LEMMA: 'ce', ORTH: '-ce'}
     ]
 
-    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"), ("N'", "Ne")):
+    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"),
+                           ("N'", "Ne")):
        VERBS['{}est-ce'.format(pre)] = [
             {LEMMA: pre_lemma, ORTH: pre},
             {LEMMA: 'être', ORTH: "est"},
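
Note for reviewers: a minimal, hypothetical sketch (not spaCy's own code) of how a special-cases
table like the one added above overrides plain whitespace tokenization. The SPECIAL_CASES dict
mirrors two entries from the patch; LEMMA, ORTH, and tokenize() are stand-ins here, not spaCy's
actual attribute IDs or tokenizer.

    # Stand-in keys; in spaCy these are attribute IDs imported from spacy.attrs.
    LEMMA, ORTH = "lemma", "orth"

    SPECIAL_CASES = {
        "J.-C.": [
            {LEMMA: "Jésus", ORTH: "J."},
            {LEMMA: "Christ", ORTH: "-C."},
        ],
        "Mme.": [
            {LEMMA: "madame", ORTH: "Mme."},
        ],
    }

    def tokenize(text):
        tokens = []
        for chunk in text.split():
            if chunk in SPECIAL_CASES:
                # Emit one token per ORTH entry instead of the raw chunk.
                tokens.extend(t[ORTH] for t in SPECIAL_CASES[chunk])
            else:
                tokens.append(chunk)
        return tokens

    print(tokenize("En 52 av. J.-C."))
    # ['En', '52', 'av.', 'J.', '-C.']

"av." passes through unchanged here only because this toy dict omits it; in the patched table it
is likewise a special case with lemma "avant", so "av. J.-C." tokenizes with full lemma coverage.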