From 8e483ec950bc649b3c440aab90af9e86dae81da0 Mon Sep 17 00:00:00 2001
From: ines
Date: Mon, 8 May 2017 15:48:04 +0200
Subject: [PATCH] Reorganise Spanish language data

---
 spacy/es/__init__.py                         |  28 ++--
 spacy/es/{lemmatization.py => lemmatizer.py} |   5 +-
 spacy/es/tag_map.py                          |   2 -
 spacy/es/tokenizer_exceptions.py             | 133 +++++++------------
 4 files changed, 71 insertions(+), 97 deletions(-)
 rename spacy/es/{lemmatization.py => lemmatizer.py} (99%)

diff --git a/spacy/es/__init__.py b/spacy/es/__init__.py
index 0a22cc711..f975bd11a 100644
--- a/spacy/es/__init__.py
+++ b/spacy/es/__init__.py
@@ -1,14 +1,17 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
-from os import path
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
 
+from ..language_data import BASE_EXCEPTIONS
 from ..language import Language
-from ..attrs import LANG
-
-from .language_data import *
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
+from ..attrs import LANG
+from ..util import update_exc
+
 
 class Spanish(Language):
     lang = 'es'
@@ -17,10 +20,13 @@ class Spanish(Language):
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
         lex_attr_getters[LANG] = lambda text: 'es'
 
-        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-        tag_map = TAG_MAP
-        stop_words = STOP_WORDS
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        tag_map = dict(TAG_MAP)
+        stop_words = set(STOP_WORDS)
+
+        @classmethod
+        def create_lemmatizer(cls, nlp=None):
+            return Lemmatizer(LOOKUP)
 
 
-
-EXPORT = Spanish
\ No newline at end of file
+__all__ = ['Spanish']
diff --git a/spacy/es/lemmatization.py b/spacy/es/lemmatizer.py
similarity index 99%
rename from spacy/es/lemmatization.py
rename to spacy/es/lemmatizer.py
index b4f6372ad..1ed648773 100644
--- a/spacy/es/lemmatization.py
+++ b/spacy/es/lemmatizer.py
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-LOOK_UP = {
+
+LOOKUP = {
     "aba": "abar",
     "ababa": "abar",
     "ababais": "abar",
@@ -491549,4 +491550,4 @@ LOOK_UP = {
     "útiles": "útil",
     "úveas": "úvea",
     "úvulas": "úvula"
-}
\ No newline at end of file
+}
diff --git a/spacy/es/tag_map.py b/spacy/es/tag_map.py
index dce29c921..7d2873926 100644
--- a/spacy/es/tag_map.py
+++ b/spacy/es/tag_map.py
@@ -1,8 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
-
 
 TAG_MAP = {
     "ADJ___": {"morph": "_", "pos": "ADJ"},
diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py
index e60bcd104..64f1cdbad 100644
--- a/spacy/es/tokenizer_exceptions.py
+++ b/spacy/es/tokenizer_exceptions.py
@@ -1,113 +1,82 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
-from ..language_data import PRON_LEMMA, DET_LEMMA
+from ..symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
+from ..deprecated import PRON_LEMMA, DET_LEMMA
 
 
-TOKENIZER_EXCEPTIONS = {
+_exc = {
     "al": [
         {ORTH: "a", LEMMA: "a", TAG: ADP},
-        {ORTH: "el", LEMMA: "el", TAG: DET}
-    ],
+        {ORTH: "l", LEMMA: "el", TAG: DET}],
 
     "consigo": [
         {ORTH: "con", LEMMA: "con"},
-        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}
-    ],
+        {ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}],
 
     "conmigo": [
         {ORTH: "con", LEMMA: "con"},
-        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}
-    ],
+        {ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}],
 
     "contigo": [
         {ORTH: "con", LEMMA: "con"},
-        {ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}
-    ],
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}], "del": [ {ORTH: "de", LEMMA: "de", TAG: ADP}, - {ORTH: "l", LEMMA: "el", TAG: DET} - ], + {ORTH: "l", LEMMA: "el", TAG: DET}], "pel": [ {ORTH: "pe", LEMMA: "per", TAG: ADP}, - {ORTH: "l", LEMMA: "el", TAG: DET} - ], + {ORTH: "l", LEMMA: "el", TAG: DET}], "pal": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"} - ], + {ORTH: "l", LEMMA: DET_LEMMA, NORM: "el"}], "pala": [ {ORTH: "pa", LEMMA: "para"}, - {ORTH: "la", LEMMA: DET_LEMMA} - ], - - "aprox.": [ - {ORTH: "aprox.", LEMMA: "aproximadamente"} - ], - - "dna.": [ - {ORTH: "dna.", LEMMA: "docena"} - ], - - "esq.": [ - {ORTH: "esq.", LEMMA: "esquina"} - ], - - "pág.": [ - {ORTH: "pág.", LEMMA: "página"} - ], - - "p.ej.": [ - {ORTH: "p.ej.", LEMMA: "por ejemplo"} - ], - - "Ud.": [ - {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"} - ], - - "Vd.": [ - {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"} - ], - - "Uds.": [ - {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"} - ], - - "Vds.": [ - {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"} - ] + {ORTH: "la", LEMMA: DET_LEMMA}] } -ORTH_ONLY = [ - "a.C.", - "a.J.C.", - "apdo.", - "Av.", - "Avda.", - "Cía.", - "etc.", - "Gob.", - "Gral.", - "Ing.", - "J.C.", - "Lic.", - "m.n.", - "no.", - "núm.", - "P.D.", - "Prof.", - "Profa.", - "q.e.p.d." - "S.A.", - "S.L.", - "s.s.s.", - "Sr.", - "Sra.", - "Srta." -] +for exc_data in [ + {ORTH: "aprox.", LEMMA: "aproximadamente"}, + {ORTH: "dna.", LEMMA: "docena"}, + {ORTH: "esq.", LEMMA: "esquina"}, + {ORTH: "pág.", LEMMA: "página"}, + {ORTH: "p.ej.", LEMMA: "por ejemplo"}, + {ORTH: "Ud.", LEMMA: PRON_LEMMA, NORM: "usted"}, + {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, + {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, + {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]: + _exc[exc_data[ORTH]] = [dict(exc_data)] + + +# Times + +_exc["12m."] = [ + {ORTH: "12"}, + {ORTH: "m.", LEMMA: "p.m."}] + + +for h in range(1, 12 + 1): + hour = str(h) + for period in ["a.m.", "am"]: + _exc[hour+period] = [ + {ORTH: hour}, + {ORTH: period, LEMMA: "a.m."}] + for period in ["p.m.", "pm"]: + _exc[hour+period] = [ + {ORTH: hour}, + {ORTH: period, LEMMA: "p.m."}] + + +for orth in [ + "a.C.", "a.J.C.", "apdo.", "Av.", "Avda.", "Cía.", "etc.", "Gob.", "Gral.", + "Ing.", "J.C.", "Lic.", "m.n.", "no.", "núm.", "P.D.", "Prof.", "Profa.", + "q.e.p.d.", "S.A.", "S.L.", "s.s.s.", "Sr.", "Sra.", "Srta."]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = dict(_exc)