From 38e07ade4c2e940567c4d6f54674a9fcedf920c6 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 6 Jul 2018 12:40:51 +0200
Subject: [PATCH] Add test for custom tokenizer serialization (resolves #2494)

---
 spacy/tests/serialize/test_serialize_tokenizer.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py
index 6d4145504..de022a263 100644
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from ...util import get_lang_class
+from ...tokenizer import Tokenizer
 from ..util import make_tempdir, assert_packed_msg_equal
 
 import pytest
@@ -13,6 +14,14 @@ def load_tokenizer(b):
     return tok
 
 
+def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
+    """Test that custom tokenizer with not all functions defined can be
+    serialized and deserialized correctly (see #2494)."""
+    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
+    tokenizer_bytes = tokenizer.to_bytes()
+    new_tokenizer = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
+
+
 @pytest.mark.skip(reason="Currently unreliable across platforms")
 @pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
 def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
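
Note (not part of the commit): the test above only checks that serializing a
partially-defined tokenizer does not raise. A minimal sketch of the same
round-trip outside the test suite, with an explicit assertion added, might
look like the following; it assumes spaCy's public Tokenizer API as used in
the patch and a plain English() pipeline:

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    # Build a partial tokenizer: only suffix_search is defined; the other
    # rule callbacks stay None (the case that used to crash, see #2494).
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab, suffix_search=nlp.tokenizer.suffix_search)

    # Serialize and restore. With the fix this must not raise, and the
    # restored tokenizer should serialize back to the same bytes.
    tokenizer_bytes = tokenizer.to_bytes()
    new_tokenizer = Tokenizer(nlp.vocab).from_bytes(tokenizer_bytes)
    assert new_tokenizer.to_bytes() == tokenizer_bytes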