# Mirror of https://github.com/explosion/spaCy.git (107 lines, 2.6 KiB, Python)
# encoding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from .. import language_data as base
|
|
from ..language_data import strings_to_exc, update_exc
|
|
|
|
from .punctuation import ELISION
|
|
|
|
from ..symbols import *
|
|
from .stop_words import STOP_WORDS
|
|
|
|
|
|
# Rebind the imported stop-word collection as a ``set``.
STOP_WORDS = set(STOP_WORDS)

# Seed the exception table from the emoticon strings of the parent package's
# ``language_data`` module, then fold in that module's abbreviation strings.
# NOTE(review): ``update_exc`` is assumed to mutate its first argument in
# place, since its return value is discarded here -- confirm against its
# definition.
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))

# French abbreviations, keyed by their surface string. Each key maps to a list
# of attribute dicts, and every dict pairs an ORTH value with a LEMMA value.
# All entries hold a single dict except "J.-C.", which is split into "J." and
# "-C." (the two ORTH values concatenate back to the key).
ABBREVIATIONS = {
    "janv.": [
        {LEMMA: "janvier", ORTH: "janv."}
    ],
    "févr.": [
        {LEMMA: "février", ORTH: "févr."}
    ],
    "avr.": [
        {LEMMA: "avril", ORTH: "avr."}
    ],
    "juill.": [
        {LEMMA: "juillet", ORTH: "juill."}
    ],
    "sept.": [
        {LEMMA: "septembre", ORTH: "sept."}
    ],
    "oct.": [
        {LEMMA: "octobre", ORTH: "oct."}
    ],
    "nov.": [
        {LEMMA: "novembre", ORTH: "nov."}
    ],
    "déc.": [
        {LEMMA: "décembre", ORTH: "déc."}
    ],
    "av.": [
        {LEMMA: "avant", ORTH: "av."}
    ],
    "apr.": [
        {LEMMA: "après", ORTH: "apr."}
    ],
    "J.-C.": [
        {LEMMA: "jésus", ORTH: "J."},
        {LEMMA: "christ", ORTH: "-C."}
    ],
    "Dr.": [
        {LEMMA: "docteur", ORTH: "Dr."}
    ],
    "M.": [
        {LEMMA: "monsieur", ORTH: "M."}
    ],
    "Mr.": [
        {LEMMA: "monsieur", ORTH: "Mr."}
    ],
    "Mme.": [
        {LEMMA: "madame", ORTH: "Mme."}
    ],
    "Mlle.": [
        {LEMMA: "mademoiselle", ORTH: "Mlle."}
    ],
    "n°": [
        {LEMMA: "numéro", ORTH: "n°"}
    ],
    "d°": [
        {LEMMA: "degrés", ORTH: "d°"}
    ],
    "St.": [
        {LEMMA: "saint", ORTH: "St."}
    ],
    "Ste.": [
        {LEMMA: "sainte", ORTH: "Ste."}
    ]
}

# Base spellings, written with a plain ASCII apostrophe, of words that contain
# an apostrophe and/or a hyphen. Further down this module they are expanded
# into INFIXES_EXCEPTIONS and registered as tokenizer exceptions.
INFIXES_EXCEPTIONS_BASE = [
    "aujourd'hui",
    "prud'homme", "prud'hommes",
    "prud'homal", "prud'homaux", "prud'homale",
    "prud'homales",
    "prud'hommal", "prud'hommaux", "prud'hommale",
    "prud'hommales",
    "prud'homie", "prud'homies",
    "prud'hommesque", "prud'hommesques",
    "prud'hommesquement",
    "c'est-à-dire", "quelqu'un", "rendez-vous",
]

def _elision_variants(words, elision_chars):
    """Return *words* respelled with each elision character, without repeats.

    For every character in *elision_chars*, each word has its ASCII
    apostrophes replaced by that character. A word with no apostrophe comes
    out unchanged on every pass, so repeated spellings are dropped while the
    order of first appearance is kept.
    """
    variants = []
    seen = set()
    for elision_char in elision_chars:
        for word in words:
            variant = word.replace("'", elision_char)
            if variant not in seen:
                seen.add(variant)
                variants.append(variant)
    return variants


# One spelling of every base word per elision character in ELISION.
INFIXES_EXCEPTIONS = _elision_variants(INFIXES_EXCEPTIONS_BASE, ELISION)
# Also register the capitalized form of every variant.
INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]

# Add the French-specific entries to the exception table. The infix spellings
# are plain strings and go through ``strings_to_exc`` first; ABBREVIATIONS is
# already a dict of attribute lists and is passed as-is.
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)

# Names exported by a star-import of this module.
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]