From 2051726fd32c823aec8649394e0033ec01c69894 Mon Sep 17 00:00:00 2001
From: Gyorgy Orosz
Date: Sat, 10 Dec 2016 23:29:41 +0100
Subject: [PATCH] Passing Hungarian abbrev tests.

---
 spacy/hu/language_data.py                    | 33 +--------
 spacy/tests/hu/test_tokenizer.py             | 27 -------
 spacy/tests/hu/tokenizer/__init__.py         |  4 +
 .../hu/tokenizer/test_default_token_dots.txt | 58 +++++++++++++++
 spacy/tests/hu/tokenizer/test_tokenizer.py   | 74 +++++++++++++++++++
 5 files changed, 139 insertions(+), 57 deletions(-)
 delete mode 100644 spacy/tests/hu/test_tokenizer.py
 create mode 100644 spacy/tests/hu/tokenizer/__init__.py
 create mode 100644 spacy/tests/hu/tokenizer/test_default_token_dots.txt
 create mode 100644 spacy/tests/hu/tokenizer/test_tokenizer.py

diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 6ee193d41..138b3afc8 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
 
 HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
 
-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-„
-“
-'
-``
-`
-#
-US$
-C$
-A$
-‘
-....
-...
-‚
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()
 
 TOKENIZER_SUFFIXES = r'''
 ,
@@ -125,11 +98,11 @@ _
 (?<=[0-9])kb
 '''.strip().split('\n')
 
-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
                      r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
                      r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
 
-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                  _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
 
 TOKENIZER_EXCEPTIONS = {
diff --git a/spacy/tests/hu/test_tokenizer.py b/spacy/tests/hu/test_tokenizer.py
deleted file mode 100644
index 4cbf1757d..000000000
--- a/spacy/tests/hu/test_tokenizer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import pytest
-
-from spacy.hu import Hungarian
-
-
-@pytest.fixture(scope="session")
-def HU():
-    return Hungarian()
-
-
-@pytest.fixture(scope="module")
-def hu_tokenizer(HU):
-    return HU.tokenizer
-
-
-@pytest.mark.parametrize(("input_str", "expected_length"), [
-    ("A vs. egy", 3),
-    ("A dr. egy", 3),
-    ("A .hu egy tld.", 5),
-    ("A .hu.", 3),
-    ("Az egy.ketto pelda.", 4),
-    ("A pl. rovidites.", 4),
-    ("A S.M.A.R.T. szo.", 4)
-])
-def test_abbreviations(hu_tokenizer, input_str, expected_length):
-    tokens = hu_tokenizer(input_str)
-    assert len(tokens) == expected_length
diff --git a/spacy/tests/hu/tokenizer/__init__.py b/spacy/tests/hu/tokenizer/__init__.py
new file mode 100644
index 000000000..818b62e48
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/__init__.py
@@ -0,0 +1,4 @@
+__author__ = 'gyorgyorosz'
+
+if __name__ == "__main__":
+    pass
diff --git a/spacy/tests/hu/tokenizer/test_default_token_dots.txt b/spacy/tests/hu/tokenizer/test_default_token_dots.txt
new file mode 100644
index 000000000..0e9ad3c65
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_default_token_dots.txt
@@ -0,0 +1,58 @@
+# TOKEN dots
+
+0. egyszeru szavak
+IN : N. kormányzósági
+IN : székhely.
+OUT: N. kormányzósági
+OUT: székhely.
+
+
+1. szavak pontokkal
+
+1.1 mondatkozi verziok
+1.1.1 pottal kezdodo szavak
+IN : A .hu egy tld.
+OUT: A .hu egy tld.
+1.1.2 pont a szo belsejeben
+IN : Az egy.ketto pelda.
+OUT: Az egy.ketto pelda.
+1.1.3 pont a szo vegen
+IN : A pl. rovidites.
+OUT: A pl. rovidites.
+1.1.4 pontozott szo
+IN : A S.M.A.R.T. szo.
+OUT: A S.M.A.R.T. szo.
+
+1.2 mondatvegi verziok
+1.2.1 pottal kezdodo szavak
+IN : A .hu.
+OUT: A .hu.
+1.2.2 pont a szo belsejeben
+IN : Az egy.ketto.
+OUT: Az egy.ketto.
+1.2.3 pont a szo vegen
+#TODO: cf. Huntoken
+IN : A pl.
+OUT: A pl.
+1.2.4 pontozott szo
+#TODO: cf. Huntoken
+IN : A S.M.A.R.T.
+OUT: A S.M.A.R.T.
+
+
+2. tobb pont
+
+2.1 ketto vagy tobb pont utan uj szo
+IN : Egy..ket.
+OUT: Egy..ket.
+IN : Valami... van.
+OUT: Valami... van.
+IN : Valami ...van...
+OUT: Valami ...van...
+IN : Valami...
+OUT: Valami...
+IN : Valami ...
+OUT: Valami ...
+IN : Valami ... más.
+OUT: Valami ... más.
+
diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py
new file mode 100644
index 000000000..12dbe5b78
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_tokenizer.py
@@ -0,0 +1,74 @@
+import os
+import re
+
+import pytest
+
+from spacy.hu import Hungarian
+
+_MODULE_PATH = os.path.dirname(__file__)
+
+
+class TokenizerTestCase(object):
+    INPUT_PREFIX = "IN :"
+    OUTPUT_PREFIX = "OUT:"
+    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
+
+    def __init__(self, input_str, expected_words):
+        self.input = input_str
+        self.expected_tokens = expected_words
+
+    def __repr__(self):
+        return "TokenizerTestCase<input: {}, words: {}>".format(repr(self.input), self.expected_tokens)
+
+    def to_tuple(self):
+        return (self.input, self.expected_tokens)
+
+    @classmethod
+    def _parse_output_line(cls, line):
+        for match in cls.WORD_PATTERN.finditer(line):
+            yield match.group(2)
+
+    @classmethod
+    def read_from_file(cls, path):
+        with open(path) as f:
+            input_lines = []
+            output_words = []
+            last_type = None
+            for line in f:
+                if line.startswith(cls.INPUT_PREFIX):
+                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
+                    last_type = TokenizerTestCase.INPUT_PREFIX
+                elif line.startswith(cls.OUTPUT_PREFIX):
+                    output_words.extend(list(cls._parse_output_line(line.strip())))
+                    last_type = TokenizerTestCase.OUTPUT_PREFIX
+                else:
+                    # Comments separate test cases
+                    if input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                    input_lines = []
+                    output_words = []
+                    last_type = None
+
+
+_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+
+
+@pytest.fixture(scope="session")
+def HU():
+    return Hungarian()
+
+
+@pytest.fixture(scope="module")
+def hu_tokenizer(HU):
+    return HU.tokenizer
+
+
+@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+def test_abbreviations(hu_tokenizer, test_case):
+    tokens = hu_tokenizer(test_case.input)
+    token_list = [token.orth_ for token in tokens if not token.is_space]
+    assert test_case.expected_tokens == token_list, "{} was erroneously tokenized as {}".format(test_case, token_list)
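
Note: the IN/OUT test-case format above is consumed by TokenizerTestCase in test_tokenizer.py, which reads the expected tokens of each OUT line via WORD_PATTERN. Below is a minimal sketch of that parsing step, assuming the <w>/<c> token markup the pattern expects; the sample OUT line is illustrative only, not copied from the data file.

    import re

    # Same shape as TokenizerTestCase.WORD_PATTERN: an opening <w> or <c> tag,
    # the token text, and the matching closing tag.
    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")

    line = "OUT: <w>A</w> <w>pl.</w> <w>rovidites.</w>"  # hypothetical markup
    tokens = [m.group(2) for m in WORD_PATTERN.finditer(line)]
    assert tokens == ["A", "pl.", "rovidites."]

Each parsed OUT line supplies the expected token list for the preceding IN line(s), which test_abbreviations then compares against the tokenizer's actual output.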