From 8a469f06a4ab45008fbac78afcdd8c612bbdad99 Mon Sep 17 00:00:00 2001
From: Jean Maillard
Date: Sun, 24 Nov 2024 18:18:37 -0800
Subject: [PATCH] Improve Ligurian tokenization

---
 spacy/lang/lij/__init__.py | 3 +-
 spacy/lang/lij/examples.py | 2 +-
 spacy/lang/lij/punctuation.py | 22 ++++-
 spacy/lang/lij/stop_words.py | 30 +++---
 spacy/lang/lij/tokenizer_exceptions.py | 98 +++++++++++--------
 spacy/tests/conftest.py | 5 +
 spacy/tests/lang/lij/__init__.py | 0
 spacy/tests/lang/lij/test_exceptions.py | 13 +++
 .../lang/lij/test_prefix_suffix_infix.py | 24 +++++
 9 files changed, 136 insertions(+), 61 deletions(-)
 create mode 100644 spacy/tests/lang/lij/__init__.py
 create mode 100644 spacy/tests/lang/lij/test_exceptions.py
 create mode 100644 spacy/tests/lang/lij/test_prefix_suffix_infix.py

diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index 3b8e972c6..bfd370e11 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,5 +1,5 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
@@ -7,6 +7,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
 
 
diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py
index ba7fe43fd..7b6186e1b 100644
--- a/spacy/lang/lij/examples.py
+++ b/spacy/lang/lij/examples.py
@@ -9,6 +9,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
     "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]
diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py
index c5c150d0a..5145856b4 100644
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,11 +1,23 @@
+from ..punctuation import (
+    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
+    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
+)
 from ..char_classes import ALPHA
-from ..punctuation import TOKENIZER_INFIXES
-
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+ELISION = "'’"
+
+
+_prefixes = [
+    r"['’‘][0-9]{2}",  # shorthand for years
+    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
+    r"[{el}‘]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
+] + BASE_TOKENIZER_PREFIXES
+
+
+_infixes = BASE_TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py
index 1d6f09d27..e7985608e 100644
--- a/spacy/lang/lij/stop_words.py
+++ b/spacy/lang/lij/stop_words.py
@@ -1,38 +1,40 @@
 STOP_WORDS = set(
     """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva
 
-bella belle belli bello ben
+bell' bell’ bella belle belli bello ben
 
-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch’ che chì chi ciù co-a co-e co-i co-o comm' comm’ comme con contr' contr’ contra cösa coscì cöse
 
-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d’ da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent’ dento
 
-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse
 
 fin fiña
 
-gh' ghe guæei
+gh' gh’ ghe guæi
 
-i î in insemme int' inta inte inti into
+i î in insemme int' int’ inta inte inti into
 
-l' lê lì lô
+l' l’ lê lì liatre liatri lô loiatre loiatri
 
-m' ma manco me megio meno mezo mi
+m' m’ ma mai manco me megio meno meza meze mezi mezo mi
 
-na n' ne ni ninte nisciun nisciuña no
+n' n’ na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri
 
 o ò ô oua
 
 parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
 
-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand’ quande quarche quarcösa quell' quell’ quella quelle quelli quello
 
-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s’ sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott’ sotta sta stæta stæte stæti stæto ste sti sto
 
-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant’ tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt’ tutta tutte tutti tutto
 
-un uña unn' unna
+un uña unn' unn’ unna
+
+voî voscià
 
 za zu
 """.split()
 )
diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py
index cf5a1af66..47e54a49c 100644
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
+
+# Returns capitalized variants, all caps variants and with curly apostrophe
+def _variants(orth, exc):
+    yield orth, exc
+    yield orth.capitalize(), [
+        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
+        for i, e in enumerate(exc)
+    ]
+    yield orth.upper(), [
+        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
+    ]
+    if "'" in orth:
+        yield from _variants(
+            orth.replace("'", "’"),
+            [
+                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
+                for e in exc
+            ],
+        )
+
+
 _exc = {}
 
-for raw in [
-    "a-e",
-    "a-o",
-    "a-i",
-    "a-a",
-    "co-a",
-    "co-e",
-    "co-i",
-    "co-o",
-    "da-a",
-    "da-e",
-    "da-i",
-    "da-o",
-    "pe-a",
-    "pe-e",
-    "pe-i",
-    "pe-o",
-]:
-    for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth}]
+# Compound prepositions
 
-# Prefix + prepositions with à (e.g. "sott'a-o")
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
+# Format: (compound form, isolated form, determiners it goes with)
+_preps = [
+    ("a-", "à", "oaie"),
+    ("co-", "con", "oaie"),
+    ("da-", "da", "oaie"),
+    ("pe-", "pe", "oaie"),
+    ("pi-", "pe", "a"),  # colloquialism
+    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
+    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
+]
+for prep_, prep, dets in _preps:
+    for det in dets:
+        for orth, exc in _variants(
+            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
+        ):
+            _exc[orth] = exc
 
-for prep in [
-    "a-a",
-    "a-e",
-    "a-o",
-    "a-i",
-]:
-    for prefix in [
-        "sott'",
-        "sott’",
-        "contr'",
-        "contr’",
-        "ch'",
-        "ch’",
-        "s'",
-        "s’",
-    ]:
-        for prefix_orth in [prefix, prefix.capitalize()]:
-            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
+# Units
+
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+# Other exceptions
+
+_other_exc = {
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
+    "‘n'": [{ORTH: "‘n'", NORM: "unna"}],
+    "'n": [{ORTH: "'n", NORM: "un"}],
+    "‘n": [{ORTH: "‘n", NORM: "un"}],
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
+}
+for orth_, exc_ in _other_exc.items():
+    for orth, exc in _variants(orth_, exc_):
+        _exc[orth] = exc
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index e30300a33..379f65a7c 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -282,6 +282,11 @@ def lg_tokenizer():
     return get_lang_class("lg")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def lij_tokenizer():
+    return get_lang_class("lij")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer
diff --git a/spacy/tests/lang/lij/__init__.py b/spacy/tests/lang/lij/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/lij/test_exceptions.py b/spacy/tests/lang/lij/test_exceptions.py
new file mode 100644
index 000000000..877a86913
--- /dev/null
+++ b/spacy/tests/lang/lij/test_exceptions.py
@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens,expected_norms",
+    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
+)
+def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
+    """Test that compound prepositions are split correctly."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
+    assert [t.norm_ for t in tokens] == expected_norms
diff --git a/spacy/tests/lang/lij/test_prefix_suffix_infix.py b/spacy/tests/lang/lij/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..7914bed8e
--- /dev/null
+++ b/spacy/tests/lang/lij/test_prefix_suffix_infix.py
@@ -0,0 +1,24 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
+def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
+    """Test that elided years (e.g. '90 for 1990) are not split."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
+def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
+    """Test that in degree units the degree symbol isn't split from the unit."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
+def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
+    """Test that left-eliding expressions are not split from their left apostrophe."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
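
Below is a quick usage sketch (not part of the patch) of the behaviour the new rules produce. It assumes a spaCy build that includes the changes above; spacy.blank("lij") builds a blank Ligurian pipeline so only the tokenizer runs, and the expected outputs are the ones asserted in the new tests.

# Minimal sketch, assuming this patch is applied to the spaCy source tree.
import spacy

nlp = spacy.blank("lij")  # blank Ligurian pipeline; only the tokenizer is used

# Compound prepositions are split, and the norms map back to the isolated forms.
doc = nlp("co-i")
print([t.text for t in doc])   # ['co-', 'i']
print([t.norm_ for t in doc])  # ['con', 'i']

# Elided years stay whole, degree units keep the degree sign, left elision is preserved.
print([t.text for t in nlp("'90")])      # ["'90"]
print([t.text for t in nlp("10°C")])     # ['10', '°C']
print([t.text for t in nlp("'n'atra")])  # ["'n'", 'atra']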