diff --git a/Makefile b/Makefile
index 84f026180..ce148c9a0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 sha = $(shell "git" "rev-parse" "--short" "HEAD")
 
-dist/spacy.pex :
+dist/spacy.pex : spacy/*.py* spacy/*/*.py*
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel
diff --git a/spacy/language.py b/spacy/language.py
index e1e01d0ca..6b0ee6361 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -650,7 +650,7 @@ class Language(object):
         for name, proc in self.pipeline:
             if name in disable:
                 continue
-            if not hasattr(proc, 'to_disk'):
+            if not hasattr(proc, 'from_disk'):
                 continue
             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
         exclude = {p: False for p in disable}
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index b4323e424..ee1a35ef1 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -95,16 +95,15 @@ def lemmatize(string, index, exceptions, rules):
     forms = []
     forms.extend(exceptions.get(string, []))
     oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[:len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
+    for old, new in rules:
+        if string.endswith(old):
+            form = string[:len(string) - len(old)] + new
+            if not form:
+                pass
+            elif form in index or not form.isalpha():
+                forms.append(form)
+            else:
+                oov_forms.append(form)
     if not forms:
         forms.extend(oov_forms)
     if not forms:
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index ed4e4c066..edc793158 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -449,7 +449,10 @@ class Tagger(Pipe):
     def predict(self, docs):
         if not any(len(doc) for doc in docs):
             # Handle case where there are no tokens in any docs.
-            return [self.model.ops.allocate((0, self.model.nO)) for doc in docs]
+            n_labels = len(self.labels)
+            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
+            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
+            return guesses, tokvecs
         tokvecs = self.model.tok2vec(docs)
         scores = self.model.softmax(tokvecs)
         guesses = []
@@ -479,7 +482,7 @@ class Tagger(Pipe):
                     if lemma != 0 and lemma != doc.c[j].lex.orth:
                         doc.c[j].lemma = lemma
                 idx += 1
-            if tensors is not None:
+            if tensors is not None and len(tensors):
                 if isinstance(doc.tensor, numpy.ndarray) \
                         and not isinstance(tensors[i], numpy.ndarray):
                     doc.extend_tensor(tensors[i].get())
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 21ee603a3..91e4b6852 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -217,6 +217,8 @@ cdef class Parser:
     def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
         if isinstance(docs, Doc):
             docs = [docs]
+        if not any(len(doc) for doc in docs):
+            return self.moves.init_batch(docs)
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index 9b6a011c9..5d1ac4c92 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 from ..util import make_tempdir
 from ...language import Language
+from ...tokenizer import Tokenizer
 
 import pytest
+import re
 
 
 @pytest.fixture
@@ -27,3 +29,24 @@ def test_serialize_language_meta_disk(meta_data):
         language.to_disk(d)
     new_language = Language().from_disk(d)
     assert new_language.meta == language.meta
+
+
+def test_serialize_with_custom_tokenizer():
+    """Test that serialization with custom tokenizer works without token_match.
+    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
+    """
+    prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
+    suffix_re = re.compile(r'''''')
+    infix_re = re.compile(r'''[~]''')
+
+    def custom_tokenizer(nlp):
+        return Tokenizer(nlp.vocab,
+                         {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
+
+    nlp = Language()
+    nlp.tokenizer = custom_tokenizer(nlp)
+    with make_tempdir() as d:
+        nlp.to_disk(d)
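Note (not part of the patch): the guards added to Tagger.predict and Parser.predict above both cover batches in which no document contains any tokens. A minimal sketch of the situation they handle, assuming a trained pipeline is installed locally; the model name 'en_core_web_sm' is only an illustration:

```python
import spacy

# Any pipeline that includes a tagger and a parser will do.
nlp = spacy.load('en_core_web_sm')

# An empty string tokenizes to a Doc with zero tokens; with the guards
# above, the tagger and parser return empty annotations for the empty
# batch instead of failing on it.
doc = nlp('')
print(len(doc))  # 0
```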