Add docstrings for Pipe API

This commit is contained in:
Matthew Honnibal 2017-09-25 16:20:49 +02:00
parent 1d73dec8b1
commit 39f390dba7
1 changed file with 34 additions and 3 deletions

View File

@ -88,17 +88,30 @@ class BaseThincComponent(object):
@classmethod @classmethod
def Model(cls, *shape, **kwargs): def Model(cls, *shape, **kwargs):
'''Initialize a model for the pipe.'''
raise NotImplementedError raise NotImplementedError
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
'''Create a new pipe instance.'''
raise NotImplementedError raise NotImplementedError
def __call__(self, doc): def __call__(self, doc):
'''Apply the pipe to one document. The document is
modified in-place, and returned.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
'''
scores = self.predict([doc]) scores = self.predict([doc])
self.set_annotations([doc], scores) self.set_annotations([doc], scores)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
'''Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
'''
for docs in cytoolz.partition_all(batch_size, stream): for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs) docs = list(docs)
scores = self.predict(docs) scores = self.predict(docs)
@ -106,27 +119,42 @@ class BaseThincComponent(object):
yield from docs yield from docs
def predict(self, docs): def predict(self, docs):
'''Apply the pipeline's model to a batch of docs, without
modifying them.
'''
raise NotImplementedError raise NotImplementedError
def set_annotations(self, docs, scores): def set_annotations(self, docs, scores):
'''Modify a batch of documents, using pre-computed scores.'''
raise NotImplementedError raise NotImplementedError
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
'''Learn from a batch of documents and gold-standard information,
updating the pipe's model.
Delegates to predict() and get_loss().
'''
raise NotImplementedError raise NotImplementedError
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
'''Find the loss and gradient of loss for the batch of
documents and their predicted scores.'''
raise NotImplementedError raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None): def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO '''Initialize the pipe for training, using data examples if available.
If no model has been initialized yet, the model is added.'''
if self.model is True: if self.model is True:
self.model = self.Model(1, token_vector_width) self.model = self.Model(**self.cfg)
def use_params(self, params): def use_params(self, params):
'''Modify the pipe's model, to use the given parameter values.
'''
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
'''Serialize the pipe to a bytestring.'''
serialize = OrderedDict(( serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)), ('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()), ('model', lambda: self.model.to_bytes()),
@ -135,6 +163,7 @@ class BaseThincComponent(object):
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
'''Load the pipe from a bytestring.'''
def load_model(b): def load_model(b):
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length self.cfg['pretrained_dims'] = self.vocab.vectors_length
@ -150,6 +179,7 @@ class BaseThincComponent(object):
return self return self
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
'''Serialize the pipe to disk.'''
serialize = OrderedDict(( serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())), ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
@ -158,6 +188,7 @@ class BaseThincComponent(object):
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
'''Load the pipe from disk.'''
def load_model(p): def load_model(p):
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length self.cfg['pretrained_dims'] = self.vocab.vectors_length