diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 1d204c46c..37c58c85f 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -280,7 +280,7 @@ _currency = (
 _punct = (
     r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
 )
-_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
+_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
 _hyphens = "- – — -- --- —— ~"
 
 # Various symbols like dingbats, but also emoji
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
index e83f0c5a5..019b3802e 100644
--- a/spacy/lang/grc/__init__.py
+++ b/spacy/lang/grc/__init__.py
@@ -1,11 +1,15 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from ...language import Language, BaseDefaults
 
 
 class AncientGreekDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
 
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
new file mode 100644
index 000000000..8f3589e9a
--- /dev/null
+++ b/spacy/lang/grc/punctuation.py
@@ -0,0 +1,46 @@
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
+from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from ..char_classes import CONCAT_QUOTES
+
+_prefixes = (
+    [
+        "†",
+        "⸏",
+    ]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        "†",
+        "⸎",
+        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
+    ]
+)
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
+    ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/lang/grc/test_tokenizer.py b/spacy/tests/lang/grc/test_tokenizer.py
new file mode 100644
index 000000000..3df5b546b
--- /dev/null
+++ b/spacy/tests/lang/grc/test_tokenizer.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+# fmt: off
+GRC_TOKEN_EXCEPTION_TESTS = [
+    ("τὸ 〈τῆς〉 φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ ⟦βαρβάρων⟧ ἄρξαι.", ["τὸ", "〈", "τῆς", "〉", "φιλοσοφίας", "ἔργον", "ἔνιοί", "φασιν", "ἀπὸ", "⟦", "βαρβάρων", "⟧", "ἄρξαι", "."]),
+    ("τὴν δὲ τῶν Αἰγυπτίων φιλοσοφίαν εἶναι τοιαύτην περί τε †θεῶν† καὶ ὑπὲρ δικαιοσύνης.", ["τὴν", "δὲ", "τῶν", "Αἰγυπτίων", "φιλοσοφίαν", "εἶναι", "τοιαύτην", "περί", "τε", "†", "θεῶν", "†", "καὶ", "ὑπὲρ", "δικαιοσύνης", "."]),
+    ("⸏πόσις δ' Ἐρεχθεύς ἐστί μοι σεσωσμένος⸏", ["⸏", "πόσις", "δ'", "Ἐρεχθεύς", "ἐστί", "μοι", "σεσωσμένος", "⸏"]),
+    ("⸏ὔπνον ἴδωμεν⸎", ["⸏", "ὔπνον", "ἴδωμεν", "⸎"]),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", GRC_TOKEN_EXCEPTION_TESTS)
+def test_grc_tokenizer(grc_tokenizer, text, expected_tokens):
+    tokens = grc_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list