From 8a469f06a4ab45008fbac78afcdd8c612bbdad99 Mon Sep 17 00:00:00 2001
From: Jean Maillard
Date: Sun, 24 Nov 2024 18:18:37 -0800
Subject: [PATCH] Improve Ligurian tokenization

---
 spacy/lang/lij/__init__.py | 3 +-
 spacy/lang/lij/examples.py | 2 +-
 spacy/lang/lij/punctuation.py | 22 ++++-
 spacy/lang/lij/stop_words.py | 30 +++---
 spacy/lang/lij/tokenizer_exceptions.py | 98 +++++++++++--------
 spacy/tests/conftest.py | 5 +
 spacy/tests/lang/lij/__init__.py | 0
 spacy/tests/lang/lij/test_exceptions.py | 13 +++
 .../lang/lij/test_prefix_suffix_infix.py | 24 +++++
 9 files changed, 136 insertions(+), 61 deletions(-)
 create mode 100644 spacy/tests/lang/lij/__init__.py
 create mode 100644 spacy/tests/lang/lij/test_exceptions.py
 create mode 100644 spacy/tests/lang/lij/test_prefix_suffix_infix.py

diff --git a/spacy/lang/lij/__init__.py b/spacy/lang/lij/__init__.py
index 3b8e972c6..bfd370e11 100644
--- a/spacy/lang/lij/__init__.py
+++ b/spacy/lang/lij/__init__.py
@@ -1,5 +1,5 @@
 from ...language import BaseDefaults, Language
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
@@ -7,6 +7,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 class LigurianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    prefixes = TOKENIZER_PREFIXES
     stop_words = STOP_WORDS
 
 
diff --git a/spacy/lang/lij/examples.py b/spacy/lang/lij/examples.py
index ba7fe43fd..7b6186e1b 100644
--- a/spacy/lang/lij/examples.py
+++ b/spacy/lang/lij/examples.py
@@ -9,6 +9,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Sciusciâ e sciorbî no se peu.",
     "Graçie di çetroin, che me son arrivæ.",
-    "Vegnime apreuvo, che ve fasso pescâ di òmmi.",
+    "Vegnîme apreuvo, che ve fasso pescâ di òmmi.",
     "Bella pe sempre l'ægua inta conchetta quande unn'agoggia d'ægua a se â trapaña.",
 ]
diff --git a/spacy/lang/lij/punctuation.py b/spacy/lang/lij/punctuation.py
index c5c150d0a..5145856b4 100644
--- a/spacy/lang/lij/punctuation.py
+++ b/spacy/lang/lij/punctuation.py
@@ -1,11 +1,23 @@
+from ..punctuation import (
+    TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES,
+    TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES,
+)
 from ..char_classes import ALPHA
-from ..punctuation import TOKENIZER_INFIXES
-
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+ELISION = "'’"
+
+
+_prefixes = [
+    r"['’‘][0-9]{2}",  # shorthand for years
+    r"[0-9]+°(?![cfkCFK])",  # use of degree symbol as ordinal indicator
+    r"[{el}‘]nn?[{el}]?".format(el=ELISION),  # elided forms of "un(na)"
+] + BASE_TOKENIZER_PREFIXES
+
+
+_infixes = BASE_TOKENIZER_INFIXES + [
+    r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
 ]
 
+TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/lij/stop_words.py b/spacy/lang/lij/stop_words.py
index 1d6f09d27..e7985608e 100644
--- a/spacy/lang/lij/stop_words.py
+++ b/spacy/lang/lij/stop_words.py
@@ -1,38 +1,40 @@
 STOP_WORDS = set(
     """
-a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuvo ascì atra atre atri atro avanti avei
+a à â a-a a-e a-i a-o aiva aloa an ancheu ancon apreuo apreuvo ascì atra atre atri atro avanti avei aveiva
 
-bella belle belli bello ben
+bell' bell’ bella belle belli bello ben
 
-ch' che chì chi ciù co-a co-e co-i co-o comm' comme con cösa coscì cöse
+ch' ch’ che chì chi ciù co-a co-e co-i co-o comm' comm’ comme con contr' contr’ contra cösa coscì cöse
 
-d' da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo
+d' d’ da da-a da-e da-i da-o dapeu de delongo derê di do doe doî donde dòppo drent' drent’ dento
 
-é e ê ea ean emmo en ëse
+é à e ê ea ean emmo en ëse
 
 fin fiña
 
-gh' ghe guæei
+gh' gh’ ghe guæi
 
-i î in insemme int' inta inte inti into
+i î in insemme int' int’ inta inte inti into
 
-l' lê lì lô
+l' l’ lê lì liatre liatri lô loiatre loiatri
 
-m' ma manco me megio meno mezo mi
+m' m’ ma mai manco me megio meno meza meze mezi mezo mi
 
-na n' ne ni ninte nisciun nisciuña no
+n' n’ na ne nì niatre niatri ninte nisciun nisciuña no noiatre noiatri
 
 o ò ô oua
 
 parte pe pe-a pe-i pe-e pe-o perché pittin pö primma pròpio
 
-quæ quand' quande quarche quella quelle quelli quello
+quæ quand' quand’ quande quarche quarcösa quell' quell’ quella quelle quelli quello
 
-s' sce scê sci sciâ sciô sciù se segge seu sò solo son sott' sta stæta stæte stæti stæto ste sti sto
+s' s’ sce scê scì scî scià sciâ sciô sciù se segge seu sò solo son sott' sott’ sotta sta stæta stæte stæti stæto ste sti sto
 
-tanta tante tanti tanto te ti torna tra tròppo tutta tutte tutti tutto
+tant' tant’ tanta tante tanti tanto te teu tò ti torna tra tròppo tutt' tutt’ tutta tutte tutti tutto
 
-un uña unn' unna
+un uña unn' unn’ unna
+
+voî voscià
 
 za zu
 """.split()
 )
diff --git a/spacy/lang/lij/tokenizer_exceptions.py b/spacy/lang/lij/tokenizer_exceptions.py
index cf5a1af66..47e54a49c 100644
--- a/spacy/lang/lij/tokenizer_exceptions.py
+++ b/spacy/lang/lij/tokenizer_exceptions.py
@@ -1,49 +1,67 @@
-from ...symbols import ORTH
+from ...symbols import ORTH, NORM
 from ...util import update_exc
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
+
+# Returns capitalized variants, all caps variants and with curly apostrophe
+def _variants(orth, exc):
+    yield orth, exc
+    yield orth.capitalize(), [
+        {ORTH: e[ORTH].capitalize() if i == 0 else e[ORTH], NORM: e.get(NORM, e[ORTH])}
+        for i, e in enumerate(exc)
+    ]
+    yield orth.upper(), [
+        {ORTH: e[ORTH].upper(), NORM: e.get(NORM, e[ORTH])} for e in exc
+    ]
+    if "'" in orth:
+        yield from _variants(
+            orth.replace("'", "’"),
+            [
+                {ORTH: e[ORTH].replace("'", "’"), NORM: e.get(NORM, e[ORTH])}
+                for e in exc
+            ],
+        )
+
+
 _exc = {}
 
-for raw in [
-    "a-e",
-    "a-o",
-    "a-i",
-    "a-a",
-    "co-a",
-    "co-e",
-    "co-i",
-    "co-o",
-    "da-a",
-    "da-e",
-    "da-i",
-    "da-o",
-    "pe-a",
-    "pe-e",
-    "pe-i",
-    "pe-o",
-]:
-    for orth in [raw, raw.capitalize()]:
-        _exc[orth] = [{ORTH: orth}]
+# Compound prepositions
 
-# Prefix + prepositions with à (e.g. "sott'a-o")
+# Compounds with "inte" and "de" aren't split as they can be ambiguous
+# Format: (compound form, isolated form, determiners it goes with)
+_preps = [
+    ("a-", "à", "oaie"),
+    ("co-", "con", "oaie"),
+    ("da-", "da", "oaie"),
+    ("pe-", "pe", "oaie"),
+    ("pi-", "pe", "a"),  # colloquialism
+    ("de-", "de", "oaie"),  # incorrect, but occasionally seen
+    ("ne-", "inte", "oaie"),  # incorrect, but occasionally seen
+]
+for prep_, prep, dets in _preps:
+    for det in dets:
+        for orth, exc in _variants(
+            prep_ + det, [{ORTH: prep_, NORM: prep}, {ORTH: det}]
+        ):
+            _exc[orth] = exc
 
-for prep in [
-    "a-a",
-    "a-e",
-    "a-o",
-    "a-i",
-]:
-    for prefix in [
-        "sott'",
-        "sott’",
-        "contr'",
-        "contr’",
-        "ch'",
-        "ch’",
-        "s'",
-        "s’",
-    ]:
-        for prefix_orth in [prefix, prefix.capitalize()]:
-            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
+# Units
+
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+# Other exceptions
+
+_other_exc = {
+    "'n'": [{ORTH: "'n'", NORM: "unna"}],
+    "‘n'": [{ORTH: "‘n'", NORM: "unna"}],
+    "'n": [{ORTH: "'n", NORM: "un"}],
+    "‘n": [{ORTH: "‘n", NORM: "un"}],
+    "tou": [{ORTH: "t", NORM: "te"}, {ORTH: "ou", NORM: "ô"}],
+}
+for orth_, exc_ in _other_exc.items():
+    for orth, exc in _variants(orth_, exc_):
+        _exc[orth] = exc
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index e30300a33..379f65a7c 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -282,6 +282,11 @@ def lg_tokenizer():
     return get_lang_class("lg")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def lij_tokenizer():
+    return get_lang_class("lij")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt")().tokenizer
diff --git a/spacy/tests/lang/lij/__init__.py b/spacy/tests/lang/lij/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/lij/test_exceptions.py b/spacy/tests/lang/lij/test_exceptions.py
new file mode 100644
index 000000000..877a86913
--- /dev/null
+++ b/spacy/tests/lang/lij/test_exceptions.py
@@ -0,0 +1,13 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,expected_tokens,expected_norms",
+    [("a-e", ["a-", "e"], ["à", "e"]), ("co-i", ["co-", "i"], ["con", "i"])],
+)
+def test_prepositions(lij_tokenizer, text, expected_tokens, expected_norms):
+    """Test that compound prepositions are split correctly."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
+    assert [t.norm_ for t in tokens] == expected_norms
diff --git a/spacy/tests/lang/lij/test_prefix_suffix_infix.py b/spacy/tests/lang/lij/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..7914bed8e
--- /dev/null
+++ b/spacy/tests/lang/lij/test_prefix_suffix_infix.py
@@ -0,0 +1,24 @@
+import pytest
+
+
+@pytest.mark.parametrize("text", ["'90", "’90", "‘90"])
+def test_lij_tokenizer_handles_year_elision(lij_tokenizer, text):
+    """Test that elided years (e.g. '90 for 1990) are not split."""
+    tokens = lij_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("10°C", ["10", "°C"])])
+def test_lij_tokenizer_handles_degrees(lij_tokenizer, text, expected_tokens):
+    """Test that in degree units the degree symbol isn't split from the unit."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
+
+
+@pytest.mark.parametrize("text,expected_tokens", [("'n'atra", ["'n'", "atra"])])
+def test_lij_tokenizer_handles_left_elision(lij_tokenizer, text, expected_tokens):
+    """Test that left-eliding expressions are not split from their left apostrophe."""
+    tokens = lij_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
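
Below is a quick usage sketch (not part of the patch) of the behaviour the new rules produce. It assumes a spaCy build that includes the changes above; spacy.blank("lij") builds a blank Ligurian pipeline so only the tokenizer runs, and the expected outputs are the ones asserted in the new tests.

# Minimal sketch, assuming this patch is applied to the spaCy source tree.
import spacy

nlp = spacy.blank("lij")  # blank Ligurian pipeline; only the tokenizer is used

# Compound prepositions are split, and the norms map back to the isolated forms.
doc = nlp("co-i")
print([t.text for t in doc])   # ['co-', 'i']
print([t.norm_ for t in doc])  # ['con', 'i']

# Elided years stay whole, degree units keep the degree sign, left elision is preserved.
print([t.text for t in nlp("'90")])      # ["'90"]
print([t.text for t in nlp("10°C")])     # ['10', '°C']
print([t.text for t in nlp("'n'atra")])  # ["'n'", 'atra']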