2017-06-03 11:26:34 +00:00
|
|
|
|
import pytest
|
2021-01-14 06:31:29 +00:00
|
|
|
|
import re
|
2018-07-24 21:38:44 +00:00
|
|
|
|
from spacy.util import get_lang_class
|
|
|
|
|
from spacy.tokenizer import Tokenizer
|
|
|
|
|
|
|
|
|
|
from ..util import make_tempdir, assert_packed_msg_equal
|
2017-06-03 11:26:34 +00:00
|
|
|
|
|
|
|
|
|
|
2017-06-03 15:05:28 +00:00
|
|
|
|
def load_tokenizer(b):
    """Create a new English tokenizer and load the serialized bytes `b` into it.

    b (bytes): Serialized tokenizer data, as produced by `to_bytes()`.
    RETURNS: The newly created tokenizer after `from_bytes(b)` was called on it.
    """
    fresh_tokenizer = get_lang_class("en")().tokenizer
    fresh_tokenizer.from_bytes(b)
    return fresh_tokenizer
|
|
|
|
|
|
|
|
|
|
|
2018-07-06 10:40:51 +00:00
|
|
|
|
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Check that a custom tokenizer round-trips through bytes even when some
    of its functions are undefined or its properties are empty (see #2494,
    #4991)."""
    # Scenario 1: a tokenizer that only defines suffix_search can be
    # serialized and loaded into a bare tokenizer without raising.
    partial = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    partial_bytes = partial.to_bytes()
    Tokenizer(en_vocab).from_bytes(partial_bytes)

    # Scenario 2: loading those bytes into a fully configured tokenizer
    # must set the empty/unset values correctly on deserialization.
    configured = get_lang_class("en")().tokenizer
    configured.token_match = re.compile("test").match
    # Precondition: everything is populated before loading.
    assert configured.rules != {}
    assert configured.token_match is not None
    assert configured.url_match is not None
    assert configured.prefix_search is not None
    assert configured.infix_finditer is not None
    configured.from_bytes(partial_bytes)
    # Postcondition: everything the serialized tokenizer lacked is cleared.
    assert configured.rules == {}
    assert configured.token_match is None
    assert configured.url_match is None
    assert configured.prefix_search is None
    assert configured.infix_finditer is None

    # Scenario 3: rules emptied after construction stay empty once reloaded.
    emptied = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
    emptied.rules = {}
    reloaded = Tokenizer(en_vocab).from_bytes(emptied.to_bytes())
    assert reloaded.rules == {}
|
|
|
|
|
|
2018-07-06 10:40:51 +00:00
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
    """Round-trip the English tokenizer through bytes and check that the
    reloaded copy serializes to the same data and splits `text` identically."""
    reloaded = load_tokenizer(en_tokenizer.to_bytes())
    # Check the serialized form with the shared helper, then byte-for-byte.
    assert_packed_msg_equal(reloaded.to_bytes(), en_tokenizer.to_bytes())
    assert reloaded.to_bytes() == en_tokenizer.to_bytes()
    # Both tokenizers must produce the same token texts for the sample.
    original_words = [token.text for token in en_tokenizer(text)]
    reloaded_words = [token.text for token in reloaded(text)]
    assert original_words == reloaded_words
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
    """Round-trip the English tokenizer through disk and check that a separate
    tokenizer loaded from the saved data serializes to the same bytes."""
    # Snapshot the expected serialization before anything is loaded.
    expected_bytes = en_tokenizer.to_bytes()
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        en_tokenizer.to_disk(file_path)
        # Load into a freshly created tokenizer. Calling from_disk on
        # en_tokenizer itself would load into the very object being checked,
        # so the comparison below would be the object against itself and
        # could never fail.
        tokenizer_d = get_lang_class("en")().tokenizer.from_disk(file_path)
        assert tokenizer_d.to_bytes() == expected_bytes
|