2017-06-03 11:26:34 +00:00
|
|
|
|
# coding: utf-8
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
2017-06-03 15:05:28 +00:00
|
|
|
|
from ...util import get_lang_class
|
|
|
|
|
from ..util import make_tempdir, assert_packed_msg_equal
|
2017-06-03 11:26:34 +00:00
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
|
2017-06-03 15:05:28 +00:00
|
|
|
|
def load_tokenizer(b):
    """Create a new tokenizer for the 'en' language class and load ``b`` into it.

    b: serialized tokenizer data, handed straight to ``from_bytes``.
    RETURNS: the newly created tokenizer, after ``from_bytes(b)`` was called.
    """
    lang_cls = get_lang_class('en')
    fresh_tokenizer = lang_cls.Defaults.create_tokenizer()
    # The return value of from_bytes is ignored on purpose: the freshly
    # created object itself is what gets handed back to the caller.
    fresh_tokenizer.from_bytes(b)
    return fresh_tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    """Round-trip the tokenizer through bytes and check it still behaves the same.

    The tokenizer is serialized with ``to_bytes``, restored through
    ``load_tokenizer``, and then both the re-serialized data and the token
    texts produced for ``text`` are compared against the original.
    """
    original = en_tokenizer
    restored = load_tokenizer(original.to_bytes())

    # Compared with assert_packed_msg_equal instead of raw byte equality;
    # the direct `==` check on the two byte strings was disabled here.
    assert_packed_msg_equal(restored.to_bytes(), original.to_bytes())

    original_doc = original(text)
    restored_doc = restored(text)
    original_texts = [token.text for token in original_doc]
    restored_texts = [token.text for token in restored_doc]
    assert original_texts == restored_texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
    """Round-trip the tokenizer through disk and compare the serialized state.

    The fixture tokenizer is written to a temporary directory with
    ``to_disk`` and read back with ``from_disk`` into a *separately created*
    tokenizer, so the comparison involves two distinct objects.
    """
    tokenizer = en_tokenizer
    with make_tempdir() as d:
        file_path = d / 'tokenizer'
        tokenizer.to_disk(file_path)
        # Load into a fresh tokenizer rather than back into the fixture:
        # `tokenizer` aliases `en_tokenizer`, so loading into the fixture
        # could leave the test comparing an object with itself.
        fresh_tokenizer = get_lang_class('en').Defaults.create_tokenizer()
        tokenizer_d = fresh_tokenizer.from_disk(file_path)
        # Same comparison helper as the bytes round-trip test in this file,
        # which uses it in place of raw `==` on the byte strings.
        assert_packed_msg_equal(tokenizer_d.to_bytes(), tokenizer.to_bytes())
|