extend example to ensure the text is preserved

This commit is contained in:
svlandeg 2021-06-16 23:56:35 +02:00
parent 29d83dec0c
commit bb9d2f1546
2 changed files with 33 additions and 6 deletions

View File

@ -427,13 +427,27 @@ def test_language_whitespace_tokenizer():
self.vocab = vocab
def __call__(self, text):
    """Tokenize ``text`` on single space characters, preserving the text.

    The string is split on the literal " " character only. Every separator
    is kept either as a token's trailing-space flag or as an explicit " "
    token, so leading, trailing and repeated spaces are not lost.

    text: The raw string to tokenize.
    RETURNS: A Doc built on self.vocab from the words and their
        trailing-space flags.
    """
    # str.split(" ") always returns at least one item, and yields an empty
    # string for every leading, trailing or repeated space.
    words = text.split(" ")
    spaces = [True] * len(words)
    # Avoid zero-length tokens: turn each empty item into an explicit " "
    # token that carries no trailing space of its own.
    for i, word in enumerate(words):
        if word == "":
            words[i] = " "
            spaces[i] = False
    # Remove the final trailing space: when the text ends with a space, the
    # last item is a placeholder whose space is already accounted for by the
    # previous token's trailing-space flag.
    if words[-1] == " ":
        words = words[0:-1]
        spaces = spaces[0:-1]
    else:
        # The text does not end with a space, so the last token has none.
        spaces[-1] = False
    return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream. ")
assert doc
text = " What's happened to me? he thought. It wasn't a dream. "
doc = nlp(text)
assert doc.text == text
def test_language_custom_tokenizer():

View File

@ -1168,8 +1168,21 @@ class WhitespaceTokenizer:
self.vocab = vocab
def __call__(self, text):
    """Tokenize ``text`` on single space characters, preserving the text.

    The string is split on the literal " " character only. Every separator
    is kept either as a token's trailing-space flag or as an explicit " "
    token, so leading, trailing and repeated spaces are not lost.

    text: The raw string to tokenize.
    RETURNS: A Doc built on self.vocab from the words and their
        trailing-space flags.
    """
    # str.split(" ") always returns at least one item, and yields an empty
    # string for every leading, trailing or repeated space.
    words = text.split(" ")
    spaces = [True] * len(words)
    # Avoid zero-length tokens: turn each empty item into an explicit " "
    # token that carries no trailing space of its own.
    for i, word in enumerate(words):
        if word == "":
            words[i] = " "
            spaces[i] = False
    # Remove the final trailing space: when the text ends with a space, the
    # last item is a placeholder whose space is already accounted for by the
    # previous token's trailing-space flag.
    if words[-1] == " ":
        words = words[0:-1]
        spaces = spaces[0:-1]
    else:
        # The text does not end with a space, so the last token has none.
        spaces[-1] = False
    return Doc(self.vocab, words=words, spaces=spaces)
# Create a blank "en" pipeline and replace its tokenizer with a
# WhitespaceTokenizer that shares the pipeline's vocab.
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)