From 56eabcb2f2dd732e1c440468817a99350caf3e51 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 21 Aug 2020 17:06:33 +0200
Subject: [PATCH] Adding like_num test for Czech (#5946)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Create lex_attrs.py

Hello,

I am missing the Czech language in spaCy, so I would like to help push it forward a little. This file is based on the other languages' lex_attrs.py files, just translated to Czech.

* Update __init__.py

Updated for use with the new Czech lex_attrs file

* Update stop_words.py

* Create test_text.py

* Add like_num testing for Czech

Co-authored-by: holubvl3 <47881982+holubvl3@users.noreply.github.com>
Co-authored-by: holubvl3 <vilemrousi@gmail.com>
Co-authored-by: Vladimír Holubec <vholubec@arcdata.cz>
---
 spacy/tests/conftest.py          |  5 +++++
 spacy/tests/lang/cs/__init__.py  |  0
 spacy/tests/lang/cs/test_text.py | 26 ++++++++++++++++++++++++++
 3 files changed, 31 insertions(+)
 create mode 100644 spacy/tests/lang/cs/__init__.py
 create mode 100644 spacy/tests/lang/cs/test_text.py

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 91b7e4d9d..567bf901c 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -47,6 +47,11 @@ def ca_tokenizer():
     return get_lang_class("ca").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")  # session scope: built once, reused by every test that requests it
+def cs_tokenizer():  # tokenizer fixture for the "cs" language class
+    return get_lang_class("cs").Defaults.create_tokenizer()  # same construction as the neighbouring ca/da fixtures
+
+
 @pytest.fixture(scope="session")
 def da_tokenizer():
     return get_lang_class("da").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/cs/__init__.py b/spacy/tests/lang/cs/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/cs/test_text.py b/spacy/tests/lang/cs/test_text.py
new file mode 100644
index 000000000..d98961738
--- /dev/null
+++ b/spacy/tests/lang/cs/test_text.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",  # input string, and the like_num value expected for its single token
+    [
+        ("10", True),  # plain digits
+        ("1", True),
+        ("10.000", True),  # digits containing "."
+        ("1000", True),
+        ("999,0", True),  # digits containing ","
+        ("devatenáct", True),  # spelled-out Czech number word ("nineteen")
+        ("osmdesát", True),  # spelled-out Czech number word ("eighty")
+        ("kvadrilion", True),  # large-number word
+        ("Pes", False),  # ordinary word ("dog"), must not be number-like
+        (",", False),  # bare punctuation, must not be number-like
+        ("1/2", True),  # fraction written with "/"
+    ],
+)
+def test_lex_attrs_like_number(cs_tokenizer, text, match):  # cs_tokenizer is the pytest fixture requested by name
+    tokens = cs_tokenizer(text)
+    assert len(tokens) == 1  # each sample must stay one token, otherwise tokens[0] would not cover the whole text
+    assert tokens[0].like_num == match