From f08c871adf6f126c2ea7112804c813b977bcb167 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 29 Jun 2018 14:32:16 +0200
Subject: [PATCH 1/8] Fix typo in Language.from_disk

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index e1e01d0ca..6b0ee6361 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -650,7 +650,7 @@ class Language(object):
         for name, proc in self.pipeline:
             if name in disable:
                 continue
-            if not hasattr(proc, 'to_disk'):
+            if not hasattr(proc, 'from_disk'):
                 continue
             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
         exclude = {p: False for p in disable}

From 526be4082329d16ecf7b1fa40b81f2008396a325 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 29 Jun 2018 14:33:12 +0200
Subject: [PATCH 2/8] Add test for 46d8a66

---
 .../serialize/test_serialize_language.py | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py
index 9b6a011c9..5d1ac4c92 100644
--- a/spacy/tests/serialize/test_serialize_language.py
+++ b/spacy/tests/serialize/test_serialize_language.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
 
 from ..util import make_tempdir
 from ...language import Language
+from ...tokenizer import Tokenizer
 
 import pytest
+import re
 
 
 @pytest.fixture
@@ -27,3 +29,24 @@ def test_serialize_language_meta_disk(meta_data):
         language.to_disk(d)
         new_language = Language().from_disk(d)
         assert new_language.meta == language.meta
+
+
+def test_serialize_with_custom_tokenizer():
+    """Test that serialization with custom tokenizer works without token_match.
+    See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
+    """
+    prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
+    suffix_re = re.compile(r'''''')
+    infix_re = re.compile(r'''[~]''')
+
+    def custom_tokenizer(nlp):
+        return Tokenizer(nlp.vocab,
+                         {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
+
+    nlp = Language()
+    nlp.tokenizer = custom_tokenizer(nlp)
+    with make_tempdir() as d:
+        nlp.to_disk(d)

From 3786942ff10de2c5144daa963d103a6549145db7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 29 Jun 2018 15:13:45 +0200
Subject: [PATCH 3/8] Fix tagger when docs are empty

---
 spacy/pipeline.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index ed4e4c066..339bf4f1c 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -449,7 +449,8 @@ class Tagger(Pipe):
     def predict(self, docs):
         if not any(len(doc) for doc in docs):
             # Handle case where there are no tokens in any docs.
-            return [self.model.ops.allocate((0, self.model.nO)) for doc in docs]
+            n_labels = len(self.labels)
+            return [self.model.ops.allocate((0, n_labels)) for doc in docs]
         tokvecs = self.model.tok2vec(docs)
         scores = self.model.softmax(tokvecs)
         guesses = []
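Patches 1 and 2 above address the same serialization bug: Language.from_disk() gated each pipe on hasattr(proc, 'to_disk') instead of 'from_disk', the method it is about to call. A minimal round-trip sketch of the path the fix and the new test exercise (tempfile stands in here for the test suite's make_tempdir helper; this is an illustration, not part of the patches):

    import tempfile
    from spacy.language import Language

    nlp = Language()
    with tempfile.TemporaryDirectory() as d:
        nlp.to_disk(d)
        # Deserialization now checks each pipe for from_disk() before calling it.
        nlp2 = Language().from_disk(d)
    assert nlp2.meta == nlp.meta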
From a1b05048d0da75f02b64a9b4719ce40137551234 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 29 Jun 2018 16:05:40 +0200
Subject: [PATCH 4/8] Fix tagger when doc is empty

---
 spacy/pipeline.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 339bf4f1c..faea20935 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -450,7 +450,9 @@ class Tagger(Pipe):
         if not any(len(doc) for doc in docs):
             # Handle case where there are no tokens in any docs.
             n_labels = len(self.labels)
-            return [self.model.ops.allocate((0, n_labels)) for doc in docs]
+            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
+            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
+            return guesses, tokvecs
         tokvecs = self.model.tok2vec(docs)
         scores = self.model.softmax(tokvecs)
         guesses = []

From d0f9f13543272c4ca3514d2b500f4f61ca521d59 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 29 Jun 2018 19:01:44 +0200
Subject: [PATCH 5/8] Update Makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 84f026180..928bc81ff 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 sha = $(shell "git" "rev-parse" "--short" "HEAD")
 
-dist/spacy.pex :
+dist/spacy.pex : spacy/*.pyx spacy/*.pxd spacy/*/*.pyx spacy/*/*.pxd
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel

From 01ace9734d776a53cd835d482692e11f748d722a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 29 Jun 2018 19:21:38 +0200
Subject: [PATCH 6/8] Make pipeline work on empty docs

---
 spacy/pipeline.pyx         | 2 +-
 spacy/syntax/nn_parser.pyx | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index faea20935..edc793158 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -482,7 +482,7 @@ class Tagger(Pipe):
                 if lemma != 0 and lemma != doc.c[j].lex.orth:
                     doc.c[j].lemma = lemma
                 idx += 1
-            if tensors is not None:
+            if tensors is not None and len(tensors):
                 if isinstance(doc.tensor, numpy.ndarray) \
                 and not isinstance(tensors[i], numpy.ndarray):
                     doc.extend_tensor(tensors[i].get())
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 21ee603a3..91e4b6852 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -217,6 +217,8 @@ cdef class Parser:
     def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
         if isinstance(docs, Doc):
             docs = [docs]
+        if not any(len(doc) for doc in docs):
+            return self.moves.init_batch(docs)
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:

From 2ec2192000e99414a7bbbf11c069c92606da4dc5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 29 Jun 2018 19:43:02 +0200
Subject: [PATCH 7/8] Revert #1389: Don't overrule rules when lemma exception
 is present

---
 spacy/lemmatizer.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index b4323e424..ee1a35ef1 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -95,16 +95,15 @@ def lemmatize(string, index, exceptions, rules):
     forms = []
     forms.extend(exceptions.get(string, []))
     oov_forms = []
-    if not forms:
-        for old, new in rules:
-            if string.endswith(old):
-                form = string[:len(string) - len(old)] + new
-                if not form:
-                    pass
-                elif form in index or not form.isalpha():
-                    forms.append(form)
-                else:
-                    oov_forms.append(form)
+    for old, new in rules:
+        if string.endswith(old):
+            form = string[:len(string) - len(old)] + new
+            if not form:
+                pass
+            elif form in index or not form.isalpha():
+                forms.append(form)
+            else:
+                oov_forms.append(form)
     if not forms:
         forms.extend(oov_forms)
     if not forms:
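Patch 7 reverts #1389: the suffix-rule loop in lemmatize() is no longer guarded by `if not forms:`, so rules apply even when the exception table already produced a candidate. A standalone sketch of the restored control flow (a simplified copy of the function in spacy/lemmatizer.py; the final fallback lines sit below the hunk's trailing context and are reproduced from the surrounding file, and the example data is made up):

    def lemmatize(string, index, exceptions, rules):
        forms = []
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        # No `if not forms:` guard here any more: rule-derived forms are
        # collected alongside exception forms rather than being skipped.
        for old, new in rules:
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if not form:
                    pass
                elif form in index or not form.isalpha():
                    forms.append(form)
                else:
                    oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms:
            forms.append(string)
        return set(forms)

    # Hypothetical data: 'taxes' has an exception entry *and* matches a rule
    # whose output is in the index, so both lemmas are now returned.
    print(lemmatize('taxes', index={'tax'}, exceptions={'taxes': ['taxis']},
                    rules=[('es', '')]))   # {'tax', 'taxis'}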
From 3c3020fccc82027aa9dcc6a32741cc7505dbbd0e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 29 Jun 2018 21:21:30 +0200
Subject: [PATCH 8/8] Update Makefile

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 928bc81ff..ce148c9a0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 sha = $(shell "git" "rev-parse" "--short" "HEAD")
 
-dist/spacy.pex : spacy/*.pyx spacy/*.pxd spacy/*/*.pyx spacy/*/*.pxd
+dist/spacy.pex : spacy/*.py* spacy/*/*.py*
 	python3.6 -m venv env3.6
 	source env3.6/bin/activate
 	env3.6/bin/pip install wheel
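Between them, patches 5 and 8 rewrite the prerequisites of the pex target twice: from no prerequisites at all, to the Cython sources (*.pyx, *.pxd), to the broader *.py* globs. One side effect worth noting: spacy/*.py* matches .py and .pyx files but not .pxd, so after patch 8 a change touching only a .pxd header no longer causes `make dist/spacy.pex` to rebuild, while edits to plain Python modules now do.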