Add test for custom tokenizer serialization (resolves #2494)

This commit is contained in:
ines 2018-07-06 12:40:51 +02:00
parent c2581f9172
commit 38e07ade4c
1 changed file with 9 additions and 0 deletions

View File

@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ...util import get_lang_class
from ...tokenizer import Tokenizer
from ..util import make_tempdir, assert_packed_msg_equal
import pytest
@@ -13,6 +14,14 @@ def load_tokenizer(b):
return tok
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Round-trip a partially configured tokenizer through bytes (see #2494).

    The tokenizer under test is constructed with only ``suffix_search``
    supplied. It is serialized with ``to_bytes`` and the result is loaded
    into a newly constructed ``Tokenizer`` via ``from_bytes``. No assertion
    is made on the result; the test passes as long as neither call raises.
    """
    suffix_only = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    payload = suffix_only.to_bytes()
    # Load the payload into a separate tokenizer built on the same vocab.
    fresh = Tokenizer(en_vocab)
    restored = fresh.from_bytes(payload)
@pytest.mark.skip(reason="Currently unreliable across platforms")
@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):