From d7f32b285c472a7c89aad6ad050dd7bfad3ff6e3 Mon Sep 17 00:00:00 2001 From: adrianeboyd <adrianeboyd@gmail.com> Date: Wed, 20 Nov 2019 16:31:29 +0100 Subject: [PATCH] Detect more empty matches in tokenizer.explain() (#4675) * Detect more empty matches in tokenizer.explain() * Include a few languages in explain non-slow tests Mark a few languages in tokenizer.explain() tests as not slow so they're run by default. --- spacy/tests/tokenizer/test_explain.py | 87 ++++++++++++++------------- spacy/tokenizer.pyx | 16 ++--- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index df707bc38..2d71588cc 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -8,57 +8,58 @@ from spacy.util import get_lang_class # "is" seems to confuse importlib, so we're also excluding it for now # excluded: ja, ru, th, uk, vi, zh, is LANGUAGES = [ - "af", - "ar", - "bg", + pytest.param("fr", marks=pytest.mark.slow()), + pytest.param("af", marks=pytest.mark.slow()), + pytest.param("ar", marks=pytest.mark.slow()), + pytest.param("bg", marks=pytest.mark.slow()), "bn", - pytest.param("ca", marks=pytest.mark.xfail()), - "cs", - "da", - "de", + pytest.param("ca", marks=pytest.mark.slow()), + pytest.param("cs", marks=pytest.mark.slow()), + pytest.param("da", marks=pytest.mark.slow()), + pytest.param("de", marks=pytest.mark.slow()), "el", "en", - "es", - "et", - "fa", - "fi", - pytest.param("fr", marks=pytest.mark.xfail()), - "ga", - "he", - "hi", - "hr", - pytest.param("hu", marks=pytest.mark.xfail()), - "id", - "it", - "kn", - "lt", - "lv", - "nb", - "nl", - pytest.param("pl", marks=pytest.mark.xfail()), - "pt", - "ro", - "si", - "sk", - "sl", - "sq", - "sr", - "sv", - "ta", - "te", - "tl", - "tr", - "tt", - "ur", + pytest.param("es", marks=pytest.mark.slow()), + pytest.param("et", marks=pytest.mark.slow()), + pytest.param("fa", marks=pytest.mark.slow()), + pytest.param("fi", marks=pytest.mark.slow()), + "fr", + pytest.param("ga", marks=pytest.mark.slow()), + pytest.param("he", marks=pytest.mark.slow()), + pytest.param("hi", marks=pytest.mark.slow()), + pytest.param("hr", marks=pytest.mark.slow()), + "hu", + pytest.param("id", marks=pytest.mark.slow()), + pytest.param("it", marks=pytest.mark.slow()), + pytest.param("kn", marks=pytest.mark.slow()), + pytest.param("lb", marks=pytest.mark.slow()), + pytest.param("lt", marks=pytest.mark.slow()), + pytest.param("lv", marks=pytest.mark.slow()), + pytest.param("nb", marks=pytest.mark.slow()), + pytest.param("nl", marks=pytest.mark.slow()), + "pl", + pytest.param("pt", marks=pytest.mark.slow()), + pytest.param("ro", marks=pytest.mark.slow()), + pytest.param("si", marks=pytest.mark.slow()), + pytest.param("sk", marks=pytest.mark.slow()), + pytest.param("sl", marks=pytest.mark.slow()), + pytest.param("sq", marks=pytest.mark.slow()), + pytest.param("sr", marks=pytest.mark.slow()), + pytest.param("sv", marks=pytest.mark.slow()), + pytest.param("ta", marks=pytest.mark.slow()), + pytest.param("te", marks=pytest.mark.slow()), + pytest.param("tl", marks=pytest.mark.slow()), + pytest.param("tr", marks=pytest.mark.slow()), + pytest.param("tt", marks=pytest.mark.slow()), + pytest.param("ur", marks=pytest.mark.slow()), ] -@pytest.mark.slow @pytest.mark.parametrize("lang", LANGUAGES) def test_tokenizer_explain(lang): - nlp = get_lang_class(lang)() + tokenizer = get_lang_class(lang).Defaults.create_tokenizer() examples = pytest.importorskip("spacy.lang.{}.examples".format(lang)) for sentence in examples.sentences: - tokens = [t.text for t in nlp.tokenizer(sentence) if not t.is_space] - debug_tokens = [t[1] for t in nlp.tokenizer.explain(sentence)] + tokens = [t.text for t in tokenizer(sentence) if not t.is_space] + debug_tokens = [t[1] for t in tokenizer.explain(sentence)] assert tokens == debug_tokens diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f95b44283..230f41921 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -461,20 +461,20 @@ cdef class Tokenizer: break if prefix_search(substring): split = prefix_search(substring).end() + # break if pattern matches the empty string + if split == 0: + break tokens.append(("PREFIX", substring[:split])) substring = substring[split:] if substring in special_cases: continue - # break if pattern matches the empty string - if split == 0: - break if suffix_search(substring): split = suffix_search(substring).start() - suffixes.append(("SUFFIX", substring[split:])) - substring = substring[:split] # break if pattern matches the empty string if split == len(substring): break + suffixes.append(("SUFFIX", substring[split:])) + substring = substring[:split] if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -485,8 +485,10 @@ cdef class Tokenizer: infixes = infix_finditer(substring) offset = 0 for match in infixes: - tokens.append(("TOKEN", substring[offset : match.start()])) - tokens.append(("INFIX", substring[match.start() : match.end()])) + if substring[offset : match.start()]: + tokens.append(("TOKEN", substring[offset : match.start()])) + if substring[match.start() : match.end()]: + tokens.append(("INFIX", substring[match.start() : match.end()])) offset = match.end() if substring[offset:]: tokens.append(("TOKEN", substring[offset:]))