mirror of https://github.com/explosion/spaCy.git
Merge pull request #8405 from svlandeg/fix/whitespace_tokenizer [ci skip]
This commit is contained in:
commit
af9d984407
|
@@ -421,6 +421,37 @@ def test_language_from_config_before_after_init_invalid():
|
||||||
English.from_config(config)
|
English.from_config(config)
|
||||||
|
|
||||||
|
|
||||||
|
def test_language_whitespace_tokenizer():
    """Test the custom whitespace tokenizer from the docs.

    Verifies that a whitespace-only tokenizer plugged into a blank
    pipeline round-trips text exactly (``doc.text == text``), including
    leading, trailing, and repeated spaces.
    """

    class WhitespaceTokenizer:
        # Minimal custom tokenizer: splits on single spaces and records
        # trailing-space flags so the original text can be reconstructed.
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(" ")
            spaces = [True] * len(words)
            # Avoid zero-length tokens: an empty string between two
            # delimiters becomes a single-space token with no trailing space.
            for i, word in enumerate(words):
                if word == "":
                    words[i] = " "
                    spaces[i] = False
            # Remove the final trailing space token if the text ended with
            # a space; otherwise mark the last token as having no trailing
            # space.
            if words[-1] == " ":
                words = words[0:-1]
                spaces = spaces[0:-1]
            else:
                spaces[-1] = False
            return Doc(self.vocab, words=words, spaces=spaces)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
    # Text deliberately has a leading space, a double space, and a
    # trailing space to exercise the zero-length-token branches above.
    text = " What's happened to me? he thought. It wasn't a dream. "
    doc = nlp(text)
    assert doc.text == text
||||||
def test_language_custom_tokenizer():
|
def test_language_custom_tokenizer():
|
||||||
"""Test that a fully custom tokenizer can be plugged in via the registry."""
|
"""Test that a fully custom tokenizer can be plugged in via the registry."""
|
||||||
name = "test_language_custom_tokenizer"
|
name = "test_language_custom_tokenizer"
|
||||||
|
|
|
@@ -1169,7 +1169,20 @@ class WhitespaceTokenizer:
|
||||||
|
|
||||||
def __call__(self, text):
|
def __call__(self, text):
|
||||||
words = text.split(" ")
|
words = text.split(" ")
|
||||||
return Doc(self.vocab, words=words)
|
spaces = [True] * len(words)
|
||||||
|
# Avoid zero-length tokens
|
||||||
|
for i, word in enumerate(words):
|
||||||
|
if word == "":
|
||||||
|
words[i] = " "
|
||||||
|
spaces[i] = False
|
||||||
|
# Remove the final trailing space
|
||||||
|
if words[-1] == " ":
|
||||||
|
words = words[0:-1]
|
||||||
|
spaces = spaces[0:-1]
|
||||||
|
else:
|
||||||
|
spaces[-1] = False
|
||||||
|
|
||||||
|
return Doc(self.vocab, words=words, spaces=spaces)
|
||||||
|
|
||||||
# Plug the custom whitespace tokenizer into a blank English pipeline by
# replacing the pipeline's default tokenizer.
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
Loading…
Reference in New Issue