diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index db6843e8f..54b7987ff 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -32,11 +32,27 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": class Tok2Vec(Pipe): + """Apply a "token-to-vector" model and set its outputs in the doc.tensor + attribute. This is mostly useful to share a single subnetwork between multiple + components, e.g. to have one embedding and CNN network shared between a + parser, tagger and NER. + + In order to use the `Tok2Vec` predictions, subsequent components should use + the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This + layer will read data from the `doc.tensor` attribute during prediction. + During training, the `Tok2Vec` component will save its prediction and backprop + callback for each batch, so that the subsequent components can backpropagate + to the shared weights. This implementation is used because it allows us to + avoid relying on object identity within the models to achieve the parameter + sharing. + """ def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None: """Initialize a tok2vec component. vocab (Vocab): The shared vocabulary. - model (thinc.api.Model): The Thinc Model powering the pipeline component. + model (thinc.api.Model[List[Doc], List[Floats2d]]): + The Thinc Model powering the pipeline component. It should take + a list of Doc objects as input, and output a list of 2d float arrays. name (str): The component instance name. DOCS: https://spacy.io/api/tok2vec#init @@ -48,9 +64,18 @@ class Tok2Vec(Pipe): self.cfg = {} def add_listener(self, listener: "Tok2VecListener") -> None: + """Add a listener for a downstream component. Usually internals.""" self.listeners.append(listener) def find_listeners(self, model: Model) -> None: + """Walk over a model, looking for layers that are Tok2VecListener + subclasses that have an upstream_name that matches this component. 
+ Listeners can also set their upstream_name attribute to the wildcard + string '*' to match any `Tok2Vec`. + + You're unlikely to ever need multiple `Tok2Vec` components, so it's + fine to leave your listeners' upstream_name on '*'. + """ for node in model.walk(): if isinstance(node, Tok2VecListener) and node.upstream_name in ( "*", @@ -59,7 +84,8 @@ class Tok2Vec(Pipe): self.add_listener(node) def __call__(self, doc: Doc) -> Doc: - """Add context-sensitive embeddings to the Doc.tensor attribute. + """Add context-sensitive embeddings to the Doc.tensor attribute, allowing + them to be used as features by downstream components. docs (Doc): The Doc to preocess. RETURNS (Doc): The processed Doc. @@ -205,11 +231,26 @@ class Tok2Vec(Pipe): class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, for instance from a component earlier in the pipeline. - """ + The Tok2VecListener layer is used as a sublayer within a component such + as a parser, NER or text categorizer. Usually you'll have multiple listeners + connecting to a single upstream Tok2Vec component, that's earlier in the + pipeline. The Tok2VecListener layers act as proxies, passing the predictions + from the Tok2Vec component into downstream components, and communicating + gradients back upstream. + """ name = "tok2vec-listener" def __init__(self, upstream_name: str, width: int) -> None: + """ + upstream_name (str): A string to identify the 'upstream' Tok2Vec component + to communicate with. The upstream name should either be the wildcard + string '*', or the name of the `Tok2Vec` component. You'll almost + never have multiple upstream Tok2Vec components, so the wildcard + string will almost always be fine. + width (int): + The width of the vectors produced by the upstream tok2vec component. 
+ """ Model.__init__(self, name=self.name, forward=forward, dims={"nO": width}) self.upstream_name = upstream_name self._batch_id = None @@ -217,15 +258,25 @@ class Tok2VecListener(Model): self._backprop = None @classmethod - def get_batch_id(cls, inputs) -> int: + def get_batch_id(cls, inputs: List[Doc]) -> int: + """Calculate a content-sensitive hash of the batch of documents, to check + whether the next batch of documents is unexpected. + """ return sum(sum(token.orth for token in doc) for doc in inputs) def receive(self, batch_id: int, outputs, backprop) -> None: + """Store a batch of training predictions and a backprop callback. The + predictions and callback are produced by the upstream Tok2Vec component, + and later will be used when the listener's component's model is called. + """ self._batch_id = batch_id self._outputs = outputs self._backprop = backprop def verify_inputs(self, inputs) -> bool: + """Check that the batch of Doc objects matches the ones we have a + prediction for. + """ if self._batch_id is None and self._outputs is None: raise ValueError(Errors.E954) else: @@ -237,6 +288,7 @@ class Tok2VecListener(Model): def forward(model: Tok2VecListener, inputs, is_train: bool): + """Supply the outputs from the upstream Tok2Vec component.""" if is_train: model.verify_inputs(inputs) return model._outputs, model._backprop