Merge pull request #8405 from svlandeg/fix/whitespace_tokenizer [ci skip]

This commit is contained in:
Ines Montani 2021-06-30 20:52:59 +10:00 committed by GitHub
commit af9d984407
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 1 deletions

View File

@ -421,6 +421,37 @@ def test_language_from_config_before_after_init_invalid():
English.from_config(config) English.from_config(config)
def test_language_whitespace_tokenizer():
    """Check that the docs' whitespace tokenizer reproduces the input text.

    The tokenizer splits on single spaces only, so leading, trailing and
    repeated spaces must survive the round trip through ``Doc.text``.
    """

    class WhitespaceTokenizer:
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            pieces = text.split(" ")
            # An empty piece marks an extra space in the input: represent it
            # as a one-space token with no trailing space so that no token
            # ends up zero-length.
            tokens = [" " if piece == "" else piece for piece in pieces]
            trailing = [piece != "" for piece in pieces]
            if tokens[-1] == " ":
                # The text ended with a space; the previous token's trailing
                # space already accounts for it, so drop the surplus token.
                del tokens[-1]
                del trailing[-1]
            else:
                # The final token is not followed by a space.
                trailing[-1] = False
            return Doc(self.vocab, words=tokens, spaces=trailing)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
    text = " What's happened to me? he thought. It wasn't a dream. "
    doc = nlp(text)
    assert doc.text == text
def test_language_custom_tokenizer(): def test_language_custom_tokenizer():
"""Test that a fully custom tokenizer can be plugged in via the registry.""" """Test that a fully custom tokenizer can be plugged in via the registry."""
name = "test_language_custom_tokenizer" name = "test_language_custom_tokenizer"

View File

@ -1169,7 +1169,20 @@ class WhitespaceTokenizer:
def __call__(self, text):
    """Split ``text`` on single spaces and return a ``Doc`` built from it.

    Words and per-token trailing-space flags are passed to ``Doc`` so that
    leading, trailing and repeated spaces in ``text`` are kept.
    """
    pieces = text.split(" ")
    # An empty piece marks an extra space in the input: represent it as a
    # one-space token with no trailing space so no token is zero-length.
    tokens = [" " if piece == "" else piece for piece in pieces]
    trailing = [piece != "" for piece in pieces]
    if tokens[-1] == " ":
        # The text ended with a space; the previous token's trailing space
        # already accounts for it, so drop the surplus token.
        del tokens[-1]
        del trailing[-1]
    else:
        # The final token is not followed by a space.
        trailing[-1] = False
    return Doc(self.vocab, words=tokens, spaces=trailing)
nlp = spacy.blank("en") nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)