From 56eabcb2f2dd732e1c440468817a99350caf3e51 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 21 Aug 2020 17:06:33 +0200 Subject: [PATCH] Adding num_like test for Czech (#5946) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create lex_attrs.py Hello, I am missing the Czech language in spaCy, so I would like to help push it along a little. This file is based on the other lex_attrs.py files, just translated to Czech. * Update __init__.py Updated for use with the new Czech lex_attrs file * Update stop_words.py * Create test_text.py * Add like_num testing for Czech Co-authored-by: holubvl3 <47881982+holubvl3@users.noreply.github.com> Co-authored-by: holubvl3 Co-authored-by: Vladimír Holubec --- spacy/tests/conftest.py | 5 +++++ spacy/tests/lang/cs/__init__.py | 0 spacy/tests/lang/cs/test_text.py | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 spacy/tests/lang/cs/__init__.py create mode 100644 spacy/tests/lang/cs/test_text.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 91b7e4d9d..567bf901c 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -47,6 +47,11 @@ def ca_tokenizer(): return get_lang_class("ca").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def cs_tokenizer(): + return get_lang_class("cs").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def da_tokenizer(): return get_lang_class("da").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/cs/__init__.py b/spacy/tests/lang/cs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/cs/test_text.py b/spacy/tests/lang/cs/test_text.py new file mode 100644 index 000000000..d98961738 --- /dev/null +++ b/spacy/tests/lang/cs/test_text.py @@ -0,0 +1,26 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", 
True), + ("1000", True), + ("999,0", True), + ("devatenáct", True), + ("osmdesát", True), + ("kvadrilion", True), + ("Pes", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(cs_tokenizer, text, match): + tokens = cs_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match