From 9957ed7897c15850fd24e03f9c9aef53a1e6ee54 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 14 Jan 2021 07:31:29 +0100 Subject: [PATCH] Override language defaults for null token and URL match (#6705) * Override language defaults for null token and URL match When the serialized `token_match` or `url_match` is `None`, override the language defaults to preserve `None` on deserialization. * Fix fixtures in tests --- spacy/tests/serialize/test_serialize_tokenizer.py | 12 ++++++++++++ spacy/tokenizer.pyx | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index cbe119225..bd6b15dc4 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import re from spacy.util import get_lang_class from spacy.tokenizer import Tokenizer @@ -22,6 +23,17 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): tokenizer_bytes = tokenizer.to_bytes() Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + # test that empty/unset values are set correctly on deserialization + tokenizer = get_lang_class("en").Defaults.create_tokenizer() + tokenizer.token_match = re.compile("test").match + assert tokenizer.rules != {} + assert tokenizer.token_match is not None + assert tokenizer.url_match is not None + tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer.rules == {} + assert tokenizer.token_match is None + assert tokenizer.url_match is None + tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]}) tokenizer.rules = {} tokenizer_bytes = tokenizer.to_bytes() diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 154a42c4f..bdc9e5f3d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -608,10 +608,16 @@ cdef class Tokenizer: self.suffix_search = re.compile(data["suffix_search"]).search if "infix_finditer" in data and isinstance(data["infix_finditer"], basestring_): self.infix_finditer = re.compile(data["infix_finditer"]).finditer + # for token_match and url_match, set to None to override the language + # defaults if no regex is provided if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + else: + self.token_match = None if "url_match" in data and isinstance(data["url_match"], basestring_): self.url_match = re.compile(data["url_match"]).match + else: + self.url_match = None if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {}