💫 Raise better error when using uninitialized pipeline component (#3074)

After creating a component, the `.model` attribute is left with the value `True`, to indicate it should be created later during `from_disk()`, `from_bytes()` or `begin_training()`. This has led to confusing errors when you try to use a component without first initializing its model.

To fix this, we add a method `require_model()` to the `Pipe` base class. The `require_model()` method needs to be called at the start of the `.predict()` and `.update()` methods of the components. It raises a `ValueError` if the model is not initialized. An error message has been added to `spacy.errors`.
This commit is contained in:
Matthew Honnibal 2018-12-20 15:54:53 +01:00 committed by Ines Montani
parent c315e08e6e
commit 9ec9f89b99
3 changed files with 27 additions and 0 deletions

View File

@ -287,6 +287,8 @@ class Errors(object):
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated " E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
"in favor of the pipe name `sentencizer`, which does the same " "in favor of the pipe name `sentencizer`, which does the same "
"thing. For example, use `nlp.create_pipeline('sentencizer')`") "thing. For example, use `nlp.create_pipeline('sentencizer')`")
E109 = ("Model for component '{name}' not initialized. Did you forget to load "
"a model, or forget to call begin_training()?")
@add_codes @add_codes

View File

@ -293,10 +293,16 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()` Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods. and `set_annotations()` methods.
""" """
self.require_model()
scores, tensors = self.predict([doc]) scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors) self.set_annotations([doc], scores, tensors=tensors)
return doc return doc
def require_model(self):
"""Raise an error if the component's model is not initialized.

The model counts as uninitialized when the `model` attribute is missing
or compares equal to `None`, `True` or `False`.

RAISES (ValueError): With the `Errors.E109` message, formatted with this
    component's `name`, when the model is uninitialized.
"""
# getattr with a default so a component that never set `model` is treated
# the same as one whose `model` is None.
if getattr(self, 'model', None) in (None, True, False):
raise ValueError(Errors.E109.format(name=self.name))
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents. """Apply the pipe to a stream of documents.
@ -313,6 +319,7 @@ class Pipe(object):
"""Apply the pipeline's model to a batch of docs, without """Apply the pipeline's model to a batch of docs, without
modifying them. modifying them.
""" """
self.require_model()
raise NotImplementedError raise NotImplementedError
def set_annotations(self, docs, scores, tensors=None): def set_annotations(self, docs, scores, tensors=None):
@ -325,6 +332,7 @@ class Pipe(object):
Delegates to predict() and get_loss(). Delegates to predict() and get_loss().
""" """
self.require_model()
raise NotImplementedError raise NotImplementedError
def rehearse(self, docs, sgd=None, losses=None, **config): def rehearse(self, docs, sgd=None, losses=None, **config):
@ -495,6 +503,7 @@ class Tensorizer(Pipe):
docs (iterable): A sequence of `Doc` objects. docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs. RETURNS (object): Vector representations for each token in the docs.
""" """
self.require_model()
inputs = self.model.ops.flatten([doc.tensor for doc in docs]) inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs) outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs]) return self.model.ops.unflatten(outputs, [len(d) for d in docs])
@ -519,6 +528,7 @@ class Tensorizer(Pipe):
sgd (callable): An optimizer. sgd (callable): An optimizer.
RETURNS (dict): Results from the update. RETURNS (dict): Results from the update.
""" """
self.require_model()
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
inputs = [] inputs = []
@ -600,6 +610,7 @@ class Tagger(Pipe):
yield from docs yield from docs
def predict(self, docs): def predict(self, docs):
self.require_model()
if not any(len(doc) for doc in docs): if not any(len(doc) for doc in docs):
# Handle case where there are no tokens in any docs. # Handle case where there are no tokens in any docs.
n_labels = len(self.labels) n_labels = len(self.labels)
@ -644,6 +655,7 @@ class Tagger(Pipe):
doc.is_tagged = True doc.is_tagged = True
def update(self, docs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
self.require_model()
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
losses[self.name] = 0. losses[self.name] = 0.
@ -904,6 +916,7 @@ class MultitaskObjective(Tagger):
return model return model
def predict(self, docs): def predict(self, docs):
self.require_model()
tokvecs = self.model.tok2vec(docs) tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs) scores = self.model.softmax(tokvecs)
return tokvecs, scores return tokvecs, scores
@ -1042,6 +1055,7 @@ class ClozeMultitask(Pipe):
return sgd return sgd
def predict(self, docs): def predict(self, docs):
self.require_model()
tokvecs = self.model.tok2vec(docs) tokvecs = self.model.tok2vec(docs)
vectors = self.model.output_layer(tokvecs) vectors = self.model.output_layer(tokvecs)
return tokvecs, vectors return tokvecs, vectors
@ -1061,6 +1075,7 @@ class ClozeMultitask(Pipe):
pass pass
def rehearse(self, docs, drop=0., sgd=None, losses=None): def rehearse(self, docs, drop=0., sgd=None, losses=None):
self.require_model()
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
losses[self.name] = 0. losses[self.name] = 0.
predictions, bp_predictions = self.model.begin_update(docs, drop=drop) predictions, bp_predictions = self.model.begin_update(docs, drop=drop)
@ -1105,9 +1120,11 @@ class SimilarityHook(Pipe):
yield self(doc) yield self(doc)
def predict(self, doc1, doc2): def predict(self, doc1, doc2):
self.require_model()
return self.model.predict([(doc1, doc2)]) return self.model.predict([(doc1, doc2)])
def update(self, doc1_doc2, golds, sgd=None, drop=0.): def update(self, doc1_doc2, golds, sgd=None, drop=0.):
self.require_model()
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
@ -1171,6 +1188,7 @@ class TextCategorizer(Pipe):
yield from docs yield from docs
def predict(self, docs): def predict(self, docs):
self.require_model()
scores = self.model(docs) scores = self.model(docs)
scores = self.model.ops.asarray(scores) scores = self.model.ops.asarray(scores)
tensors = [doc.tensor for doc in docs] tensors = [doc.tensor for doc in docs]

View File

@ -226,8 +226,14 @@ cdef class Parser:
self.set_annotations(subbatch, parse_states, tensors=None) self.set_annotations(subbatch, parse_states, tensors=None)
for doc in batch_in_order: for doc in batch_in_order:
yield doc yield doc
def require_model(self):
"""Raise an error if the component's model is not initialized.

The model counts as uninitialized when the `model` attribute is missing
or compares equal to `None`, `True` or `False`.

RAISES (ValueError): With the `Errors.E109` message, formatted with this
    component's `name`, when the model is uninitialized.
"""
# getattr with a default so a missing `model` attribute is handled like
# a `model` of None instead of raising AttributeError.
if getattr(self, 'model', None) in (None, True, False):
raise ValueError(Errors.E109.format(name=self.name))
def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.): def predict(self, docs, beam_width=1, beam_density=0.0, drop=0.):
self.require_model()
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
if not any(len(doc) for doc in docs): if not any(len(doc) for doc in docs):
@ -375,6 +381,7 @@ cdef class Parser:
return [b for b in beams if not b.is_done] return [b for b in beams if not b.is_done]
def update(self, docs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
self.require_model()
if isinstance(docs, Doc) and isinstance(golds, GoldParse): if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs] docs = [docs]
golds = [golds] golds = [golds]