This commit is contained in:
ines 2018-06-29 14:33:12 +02:00
parent f08c871adf
commit 526be40823
1 changed files with 23 additions and 0 deletions

View File

@ -3,8 +3,10 @@ from __future__ import unicode_literals
from ..util import make_tempdir
from ...language import Language
from ...tokenizer import Tokenizer
import pytest
import re
@pytest.fixture
@ -27,3 +29,24 @@ def test_serialize_language_meta_disk(meta_data):
language.to_disk(d)
new_language = Language().from_disk(d)
assert new_language.meta == language.meta
def test_serialize_with_custom_tokenizer():
    """Check that a pipeline using a custom tokenizer built without
    token_match can be written to disk.
    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
    """
    # Bind the bound regex methods directly; the tokenizer is only handed
    # prefix, suffix and infix callables -- no token_match is passed.
    prefix_search = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''').search
    suffix_search = re.compile(r'''''').search
    infix_finditer = re.compile(r'''[~]''').finditer

    nlp = Language()
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        {},
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
    )

    # No explicit assertion: the test succeeds when to_disk() completes
    # without raising.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)