mirror of https://github.com/explosion/spaCy.git
53 lines
1.2 KiB
Python
53 lines
1.2 KiB
Python
|
# coding: utf8
|
|||
|
from __future__ import unicode_literals
|
|||
|
from ...symbols import ORTH, LEMMA
|
|||
|
|
|||
|
_exc = {}
|
|||
|
|
|||
|
for raw, lemma in [
|
|||
|
("a-a", "a-o"),
|
|||
|
("a-e", "a-o"),
|
|||
|
("a-o", "a-o"),
|
|||
|
("a-i", "a-o"),
|
|||
|
("co-a", "co-o"),
|
|||
|
("co-e", "co-o"),
|
|||
|
("co-i", "co-o"),
|
|||
|
("co-o", "co-o"),
|
|||
|
("da-a", "da-o"),
|
|||
|
("da-e", "da-o"),
|
|||
|
("da-i", "da-o"),
|
|||
|
("da-o", "da-o"),
|
|||
|
("pe-a", "pe-o"),
|
|||
|
("pe-e", "pe-o"),
|
|||
|
("pe-i", "pe-o"),
|
|||
|
("pe-o", "pe-o"),
|
|||
|
]:
|
|||
|
for orth in [raw, raw.capitalize()]:
|
|||
|
_exc[orth] = [{ORTH: orth, LEMMA: lemma}]
|
|||
|
|
|||
|
# Prefix + prepositions with à (e.g. "sott'a-o")
|
|||
|
|
|||
|
for prep, prep_lemma in [
|
|||
|
("a-a", "a-o"),
|
|||
|
("a-e", "a-o"),
|
|||
|
("a-o", "a-o"),
|
|||
|
("a-i", "a-o"),
|
|||
|
]:
|
|||
|
for prefix, prefix_lemma in [
|
|||
|
("sott'", "sotta"),
|
|||
|
("sott’", "sotta"),
|
|||
|
("contr'", "contra"),
|
|||
|
("contr’", "contra"),
|
|||
|
("ch'", "che"),
|
|||
|
("ch’", "che"),
|
|||
|
("s'", "se"),
|
|||
|
("s’", "se"),
|
|||
|
]:
|
|||
|
for prefix_orth in [prefix, prefix.capitalize()]:
|
|||
|
_exc[prefix_orth+prep] = [
|
|||
|
{ORTH: prefix_orth, LEMMA: prefix_lemma},
|
|||
|
{ORTH: prep, LEMMA: prep_lemma},
|
|||
|
]
|
|||
|
|
|||
|
TOKENIZER_EXCEPTIONS = _exc
|