From bb9d2f15466ba32e221ec7138cfb30c1f0aabad6 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 16 Jun 2021 23:56:35 +0200
Subject: [PATCH] extend example to ensure the text is preserved

---
 spacy/tests/test_language.py              | 22 ++++++++++++++++++----
 website/docs/usage/linguistic-features.md | 17 +++++++++++++++--
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 72d1597fd..f5773ebd9 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -427,13 +427,27 @@ def test_language_whitespace_tokenizer():
             self.vocab = vocab
 
         def __call__(self, text):
-            words = text.split()
-            return Doc(self.vocab, words=words)
+            words = text.split(" ")
+            spaces = [True] * len(words)
+            # Avoid zero-length tokens
+            for i, word in enumerate(words):
+                if word == "":
+                    words[i] = " "
+                    spaces[i] = False
+            # Remove the final trailing space
+            if words[-1] == " ":
+                words = words[0:-1]
+                spaces = spaces[0:-1]
+            else:
+                spaces[-1] = False
+
+            return Doc(self.vocab, words=words, spaces=spaces)
 
     nlp = spacy.blank("en")
     nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
-    doc = nlp("What's happened to me? he thought. It wasn't a dream. ")
-    assert doc
+    text = " What's happened to me? he thought. It wasn't a dream. "
+    doc = nlp(text)
+    assert doc.text == text
 
 
 def test_language_custom_tokenizer():
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 7dc6cff25..ec44fe9f3 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1168,8 +1168,21 @@ class WhitespaceTokenizer:
         self.vocab = vocab
 
     def __call__(self, text):
-        words = text.split()
-        return Doc(self.vocab, words=words)
+        words = text.split(" ")
+        spaces = [True] * len(words)
+        # Avoid zero-length tokens
+        for i, word in enumerate(words):
+            if word == "":
+                words[i] = " "
+                spaces[i] = False
+        # Remove the final trailing space
+        if words[-1] == " ":
+            words = words[0:-1]
+            spaces = spaces[0:-1]
+        else:
+            spaces[-1] = False
+
+        return Doc(self.vocab, words=words, spaces=spaces)
 
 nlp = spacy.blank("en")
 nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
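
For reference, the round trip in the updated test holds because `Doc` rebuilds `doc.text` by concatenating each token with a single trailing space wherever the corresponding `spaces` entry is `True`; the input's leading space survives as its own single-space token with `spaces=False`. Below is a minimal sketch of that property, assuming only an installed spaCy (the token list is illustrative, not taken from the patch):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# The leading space is kept as a single-space token with spaces=False,
# so joining the tokens reproduces the raw input string exactly.
words = [" ", "It", "wasn't", "a", "dream."]
spaces = [False, True, True, True, False]

doc = Doc(Vocab(), words=words, spaces=spaces)
assert doc.text == " It wasn't a dream."
```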