diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py
index 90763eda5..06d146748 100644
--- a/spacy/lang/it/__init__.py
+++ b/spacy/lang/it/__init__.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
@@ -22,6 +22,7 @@ class ItalianDefaults(Language.Defaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    prefixes = TOKENIZER_PREFIXES
     infixes = TOKENIZER_INFIXES
 
 
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index 4fa931fde..f2c1fd84a 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -1,15 +1,39 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
 
 
-ELISION = " ' ’ ".strip().replace(" ", "")
+ELISION = "'’"
 
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_prefixes = (
+    [
+        r"'[0-9][0-9]",
+        r"[0-9]+°",
+    ]
+    + TOKENIZER_PREFIXES
+)
+
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
+        r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION)
+    ]
+)
+
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 62f568c5c..70dfe92bd 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -2,6 +2,56 @@ from __future__ import unicode_literals
 
 from ...symbols import ORTH, LEMMA
 
-_exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}
+_exc = {
+    "all'art.": [{ORTH: "all'"}, {ORTH: "art."}],
+    "dall'art.": [{ORTH: "dall'"}, {ORTH: "art."}],
+    "dell'art.": [{ORTH: "dell'"}, {ORTH: "art."}],
+    "L'art.": [{ORTH: "L'"}, {ORTH: "art."}],
+    "l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
+    "nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
+    "po'": [{ORTH: "po'", LEMMA: "poco"}],
+    "sett..": [{ORTH: "sett."}, {ORTH: "."}]
+}
+
+for orth in [
+    "..",
+    "....",
+    "al.",
+    "all-path",
+    "art.",
+    "Art.",
+    "artt.",
+    "att.",
+    "by-pass",
+    "c.d.",
+    "centro-sinistra",
+    "check-up",
+    "Civ.",
+    "cm.",
+    "Cod.",
+    "col.",
+    "Cost.",
+    "d.C.",
+    'de"',
+    "distr.",
+    "E'",
+    "ecc.",
+    "e-mail",
+    "e/o",
+    "etc.",
+    "Jr.",
+    "n°",
+    "nord-est",
+    "pag.",
+    "Proc.",
+    "prof.",
+    "sett.",
+    "s.p.a.",
+    "ss.",
+    "St.",
+    "tel.",
+    "week-end",
+]:
+    _exc[orth] = [{ORTH: orth}]
 
 TOKENIZER_EXCEPTIONS = _exc
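
A minimal smoke test for the patched tokenizer (my own sketch, not part of the diff; it assumes a spaCy v2.x checkout with this patch applied, and the sample sentences are illustrative):

# smoke_test_it_tokenizer.py
import spacy

nlp = spacy.blank("it")  # builds ItalianDefaults, so the new rules are active

# New prefix rules: forms like "'90" (abbreviated year) and "13°" (ordinal)
# should no longer be broken at the apostrophe or degree sign.
print([t.text for t in nlp("Nel '90 arrivò 13°.")])

# New elision infix + exceptions: "dell'art." is listed in TOKENIZER_EXCEPTIONS
# and should come out as ["dell'", "art."]; the widened lookahead also splits
# elisions before digits and double quotes (e.g. "nell'89").
print([t.text for t in nlp("ai sensi dell'art. 7")])

# The pre-existing exception still holds: "po'" stays one token, lemma "poco".
print([(t.text, t.lemma_) for t in nlp("un po' di tempo")])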