Update docs and types

Ines Montani 2020-07-31 17:02:54 +02:00
parent dab31426e1
commit e9e8fa2466
22 changed files with 232 additions and 137 deletions

View File

@ -1,6 +1,7 @@
from typing import List
from thinc.api import Model
from thinc.types import Floats2d
from ..tokens import Doc
@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
)
def init(model, X=None, Y=None):
def init(model: Model, X=None, Y=None):
vectors_table = model.ops.alloc3f(
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
)
model.set_param("E", vectors_table)
def forward(model, docs, is_train):
def forward(model: Model, docs: List[Doc], is_train: bool):
if docs is None:
return []
ids = []
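The `init`/`forward` pair annotated above follows thinc's custom-layer contract: `forward` computes the output and returns a backprop callback, while `init` allocates parameters once the dimensions are known. A minimal sketch of that contract (illustrative only — the layer name, shapes, and dims are assumptions, not code from this commit):

```python
from typing import Callable, Optional, Tuple
from thinc.api import Model
from thinc.types import Floats2d


def MyLinear(nO: Optional[int] = None, nI: Optional[int] = None) -> Model[Floats2d, Floats2d]:
    # Dims may be left unset here; init() infers them from sample data.
    return Model("my_linear", forward, init=init, dims={"nO": nO, "nI": nI}, params={"W": None})


def init(model: Model, X: Optional[Floats2d] = None, Y: Optional[Floats2d] = None) -> Model:
    if X is not None:
        model.set_dim("nI", X.shape[1])
    if Y is not None:
        model.set_dim("nO", Y.shape[1])
    # Allocate the weights once both dims are known, like the "E" table above.
    model.set_param("W", model.ops.alloc2f(model.get_dim("nO"), model.get_dim("nI")))
    return model


def forward(model: Model, X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]:
    W = model.get_param("W")
    Y = model.ops.gemm(X, W, trans2=True)

    def backprop(dY: Floats2d) -> Floats2d:
        model.inc_grad("W", model.ops.gemm(dY, X, trans1=True))
        return model.ops.gemm(dY, W)

    return Y, backprop
```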

View File

@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]:
)
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
if X is not None and Y is not None:
if X.data.shape != Y.data.shape:
# TODO: Fix error

View File

@ -4,14 +4,14 @@ from thinc.api import Model
from ..attrs import LOWER
def extract_ngrams(ngram_size, attr=LOWER) -> Model:
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
model = Model("extract_ngrams", forward)
model.attrs["ngram_size"] = ngram_size
model.attrs["attr"] = attr
return model
def forward(model, docs, is_train: bool):
def forward(model: Model, docs, is_train: bool):
batch_keys = []
batch_vals = []
for doc in docs:

View File

@ -1,5 +1,4 @@
from pathlib import Path
from typing import Optional
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear
@ -9,7 +8,7 @@ from ...vocab import Vocab
@registry.architectures.register("spacy.EntityLinker.v1")
def build_nel_encoder(tok2vec, nO=None):
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
with Model.define_operators({">>": chain, "**": clone}):
token_width = tok2vec.get_dim("nO")
output_layer = Linear(nO=nO, nI=token_width)
@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None):
@registry.assets.register("spacy.KBFromFile.v1")
def load_kb(vocab_path, kb_path) -> KnowledgeBase:
def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
vocab = Vocab().from_disk(vocab_path)
kb = KnowledgeBase(vocab=vocab)
kb.load_bulk(kb_path)
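Because `load_kb` is registered under `registry.assets`, a config can refer to it by name instead of importing it; following thinc's convention, the `@` key matches the registry name. A hedged sketch of such a reference (the block name and file paths are purely illustrative, not part of this commit):

```ini
[kb_loader]
@assets = "spacy.KBFromFile.v1"
vocab_path = "output/vocab"
kb_path = "output/kb"
```

The same function can also be fetched programmatically via `registry.assets.get("spacy.KBFromFile.v1")`.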

View File

@ -1,10 +1,20 @@
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
import numpy
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from ...vocab import Vocab # noqa: F401
from ...tokens import Doc # noqa: F401
def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
def build_multi_task_model(
tok2vec: Model,
maxout_pieces: int,
token_vector_width: int,
nO: Optional[int] = None,
) -> Model:
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain(
tok2vec,
@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
return model
def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):
def build_cloze_multi_task_model(
vocab: "Vocab",
tok2vec: Model,
maxout_pieces: int,
hidden_size: int,
nO: Optional[int] = None,
) -> Model:
# nO = vocab.vectors.data.shape[1]
output_layer = chain(
list2array(),
@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=
def build_cloze_characters_multi_task_model(
vocab, tok2vec, maxout_pieces, hidden_size, nr_char
):
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
) -> Model:
output_layer = chain(
list2array(),
Maxout(hidden_size, nP=maxout_pieces),
LayerNorm(nI=hidden_size),
MultiSoftmax([256] * nr_char, nI=hidden_size),
)
model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", output_layer)
return model
def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
def build_masked_language_model(
vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
"""Convert a model into a BERT-style masked language model"""
random_words = _RandomWords(vocab)
def mlm_forward(model, docs, is_train):
@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
return output, mlm_backward
def mlm_initialize(model, X=None, Y=None):
def mlm_initialize(model: Model, X=None, Y=None):
wrapped = model.layers[0]
wrapped.initialize(X=X, Y=Y)
for dim in wrapped.dim_names:
@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
dims={dim: None for dim in wrapped_model.dim_names},
)
mlm_model.set_ref("wrapped", wrapped_model)
return mlm_model
class _RandomWords:
def __init__(self, vocab):
def __init__(self, vocab: "Vocab") -> None:
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000]
@ -104,7 +119,7 @@ class _RandomWords:
self.probs /= self.probs.sum()
self._cache = []
def next(self):
def next(self) -> str:
if not self._cache:
self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs)
@ -113,9 +128,11 @@ class _RandomWords:
return self.words[index]
def _apply_mask(docs, random_words, mask_prob=0.15):
def _apply_mask(
docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
) -> Tuple[numpy.ndarray, List["Doc"]]:
# This needs to be here to avoid circular imports
from ...tokens import Doc
from ...tokens import Doc # noqa: F811
N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,))
@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15):
return mask, masked_docs
def _replace_word(word, random_words, mask="[MASK]"):
def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
roll = numpy.random.random()
if roll < 0.8:
return mask
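Only the first branch of `_replace_word` is visible in this hunk; the function follows the usual BERT masking recipe. A sketch of the full decision, where the 10%/10% split for the remaining branches is the standard recipe and an assumption about the lines not shown here:

```python
import numpy

def _replace_word_sketch(word: str, random_words: "_RandomWords", mask: str = "[MASK]") -> str:
    roll = numpy.random.random()
    if roll < 0.8:
        return mask                  # 80%: replace the token with the mask symbol
    elif roll < 0.9:
        return random_words.next()   # 10% (assumed): swap in a random vocabulary word
    else:
        return word                  # 10% (assumed): keep the original word unchanged
```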

View File

@ -1,6 +1,5 @@
from pydantic import StrictInt
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
from thinc.api import LayerNorm, Maxout, Mish
from typing import Optional
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
tok2vec: Model,
nr_feature_tokens: StrictInt,
hidden_width: StrictInt,
maxout_pieces: StrictInt,
use_upper=True,
nO=None,
):
nr_feature_tokens: int,
hidden_width: int,
maxout_pieces: int,
use_upper: bool = True,
nO: Optional[int] = None,
) -> Model:
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
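Dropping the pydantic `StrictInt` annotations doesn't change how the architecture is configured; it is still registered as `spacy.TransitionBasedParser.v1` and filled in from the config. A hedged config sketch using the parameters from this signature (the values are illustrative, not defaults taken from this commit):

```ini
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[model.tok2vec]
# ...
```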

View File

@ -26,7 +26,6 @@ def BiluoTagger(
with_array(softmax_activation()),
padded2list(),
)
return Model(
"biluo-tagger",
forward,
@ -52,7 +51,6 @@ def IOBTagger(
with_array(softmax_activation()),
padded2list(),
)
return Model(
"iob-tagger",
forward,

View File

@ -1,10 +1,11 @@
from typing import Optional
from thinc.api import zero_init, with_array, Softmax, chain, Model
from ...util import registry
@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model:
def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
output_layer = Softmax(nO, t2v_width, init_W=zero_init)

View File

@ -2,10 +2,9 @@ from typing import Optional
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor
from ... import util
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier(
@registry.architectures.register("spacy.TextCatBOW.v1")
def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
def build_bow_text_classifier(
exclusive_classes: bool,
ngram_size: int,
no_output_layer: bool,
nO: Optional[int] = None,
) -> Model:
with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO)
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier(
width,
embed_size,
pretrained_vectors,
exclusive_classes,
ngram_size,
window_size,
conv_depth,
dropout,
nO=None,
):
width: int,
embed_size: int,
pretrained_vectors: Optional[bool],
exclusive_classes: bool,
ngram_size: int,
window_size: int,
conv_depth: int,
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(
@ -91,7 +95,6 @@ def build_text_classifier(
dropout=dropout,
seed=13,
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
uniqued(
@ -100,7 +103,6 @@ def build_text_classifier(
column=cols.index(ORTH),
)
)
if pretrained_vectors:
static_vectors = StaticVectors(width)
vector_layer = trained_vectors | static_vectors
@ -152,7 +154,12 @@ def build_text_classifier(
@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
def build_text_classifier_lowdata(
width: int,
pretrained_vectors: Optional[bool],
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
with Model.define_operators({">>": chain, "**": clone}):
model = (

View File

@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d
from ...tokens import Doc
from ... import util
from ...util import registry
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
@registry.architectures.register("spacy.Tok2VecListener.v1")
def tok2vec_listener_v1(width, upstream="*"):
def tok2vec_listener_v1(width: int, upstream: str = "*"):
tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
return tok2vec
@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec(
width=width,
depth=depth,
window_size=window_size,
maxout_pieces=maxout_pieces
)
maxout_pieces=maxout_pieces,
),
)
@registry.architectures.register("spacy.Tok2Vec.v1")
def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
@ -68,7 +68,6 @@ def MultiHashEmbed(
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
):
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7
def make_hash_embed(feature):
@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
chain(
FeatureExtractor([NORM]),
list2ragged(),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5))
)
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
),
),
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
ragged2list()
ragged2list(),
)
return model
@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
def MishWindowEncoder(width, window_size, depth):
cnn = chain(
expand_window(window_size=window_size),
Mish(
nO=width,
nI=width * ((window_size * 2) + 1),
dropout=0.0,
normalize=True
),
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
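`clone(residual(cnn), depth)` stacks `depth` copies of the residual block, each computing `X + cnn(X)` with its own weights; the residual sum is why the encoder's output width must stay equal to its input width. An illustrative expansion with made-up values (not code from this commit):

```python
from thinc.api import Mish, chain, clone, expand_window, residual

width, window_size, depth = 96, 1, 4  # illustrative values only

cnn = chain(
    expand_window(window_size=window_size),
    # nI covers the token plus window_size neighbours on each side
    Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
)
# Equivalent in spirit to chaining depth residual blocks with independent weights:
# residual(cnn_1) >> residual(cnn_2) >> ... >> residual(cnn_depth)
encoder = clone(residual(cnn), depth)
encoder.set_dim("nO", width)
```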

View File

@ -7,7 +7,7 @@ import importlib.util
import re
from pathlib import Path
import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
import functools
import itertools
import numpy.random
@ -24,8 +24,6 @@ import tempfile
import shutil
import shlex
import inspect
from thinc.types import Unserializable
try:
import cupy.random

View File

@ -6,6 +6,7 @@ menu:
- ['Tok2Vec', 'tok2vec']
- ['Transformers', 'transformers']
- ['Parser & NER', 'parser']
- ['Tagging', 'tagger']
- ['Text Classification', 'textcat']
- ['Entity Linking', 'entitylinker']
---
@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.HashEmbedCNN.v1"
> # TODO: ...
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| -------------------- | ----- | ----------- |
| `width` | int | |
| `depth` | int | |
| `embed_size` | int | |
| `window_size` | int | |
| `maxout_pieces` | int | |
| `subword_features` | bool | |
| `dropout` | float | |
| `pretrained_vectors` | bool | |
### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
@ -99,6 +124,28 @@ architectures into your training config.
| `use_upper` | bool | |
| `nO` | int | |
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
### spacy.Tagger.v1 {#Tagger}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.Tagger.v1"
> nO = null
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nO` | int | |
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
@ -112,3 +159,21 @@ architectures into your training config.
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
### spacy.EntityLinker.v1 {#EntityLinker}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.EntityLinker.v1"
> nO = null
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nO` | int | |

View File

@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("parser", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | <!-- TODO: --> | `None` |
| `moves` | list | | `None` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | <!-- TODO: --> |
| `moves` | list | |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
| `multitasks` | `Iterable` | <!-- TODO: --> |
| `learn_tokens` | bool | <!-- TODO: --> |
| `min_action_freq` | int | <!-- TODO: --> |
| `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | |
| `learn_tokens` | bool | |
| `min_action_freq` | int | |
## DependencyParser.\_\_call\_\_ {#call tag="method"}

View File

@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
| `kb` | `KnowledgeBase` | <!-- TODO: --> | `None` |
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | `[]` |
| `incl_prior` | bool | <!-- TODO: --> |  `True` |
| `incl_context` | bool | <!-- TODO: --> | `True` |
| `kb` | `KnowledgeBase` | | `None` |
| `labels_discard` | `Iterable[str]` | | `[]` |
| `incl_prior` | bool | |  `True` |
| `incl_context` | bool | | `True` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
```python
@ -65,16 +67,18 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `kb` | `KnowledgeBase` | <!-- TODO: --> |
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> |
| `incl_prior` | bool | <!-- TODO: --> |
| `incl_context` | bool | <!-- TODO: --> |
| `kb` | `KnowledgeBase` | |
| `labels_discard` | `Iterable[str]` | |
| `incl_prior` | bool | |
| `incl_context` | bool | |
## EntityLinker.\_\_call\_\_ {#call tag="method"}

View File

@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("ner", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | <!-- TODO: --> | `None` |
| `moves` | list | | `None` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | <!-- TODO: --> |
| `moves` | list | |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
| `multitasks` | `Iterable` | <!-- TODO: --> |
| `learn_tokens` | bool | <!-- TODO: --> |
| `min_action_freq` | int | <!-- TODO: --> |
| `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | |
| `learn_tokens` | bool | |
| `min_action_freq` | int | |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

View File

@ -8,9 +8,8 @@ new: 3.0
An `Example` holds the information for one training instance. It stores two
`Doc` objects: one for holding the gold-standard reference data, and one for
holding the predictions of the pipeline. An `Alignment` <!-- TODO: link? -->
object stores the alignment between these two documents, as they can differ in
tokenization.
holding the predictions of the pipeline. An `Alignment` object stores the
alignment between these two documents, as they can differ in tokenization.
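In practice the reference `Doc` carries the gold annotations and the predicted `Doc` comes from the pipeline; `Example.from_dict` is a convenient way to build the pair. A hedged sketch (the `Example` import path is an assumption about this version of spaCy, and the annotations are illustrative):

```python
import spacy
from spacy.training import Example  # assumed import path; some versions expose this from spacy.gold

nlp = spacy.blank("en")
predicted = nlp("Apple is opening a store in San Francisco")
example = Example.from_dict(predicted, {"entities": [(0, 5, "ORG"), (28, 41, "GPE")]})

# example.predicted and example.reference may tokenize differently;
# the Alignment object maps token indices between the two.
print(example.reference.ents)
```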
## Example.\_\_init\_\_ {#init tag="method"}

View File

@ -98,9 +98,9 @@ decorator. For more details and examples, see the
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@ -146,9 +146,9 @@ examples, see the
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
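Both decorators attach the same pipeline-analysis metadata (`assigns`, `requires`, `retokenizes`) alongside the factory's default config. A hedged sketch of a factory registration using a few of these arguments (the component name and behaviour are made up for illustration):

```python
from spacy.language import Language
from spacy.tokens import Doc


@Language.factory(
    "ent_id_setter",                # hypothetical component name
    assigns=["token.ent_id"],       # declared for pipeline analysis
    retokenizes=False,
    default_config={"overwrite": True},
)
def create_ent_id_setter(nlp: Language, name: str, overwrite: bool):
    def ent_id_setter(doc: Doc) -> Doc:
        # Copy each entity's label onto its tokens' ent_id, as a toy example.
        for ent in doc.ents:
            for token in ent:
                if overwrite or not token.ent_id_:
                    token.ent_id_ = ent.label_
        return doc

    return ent_id_setter
```

After registration the component can be added by name, e.g. `nlp.add_pipe("ent_id_setter", config={"overwrite": False})`.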
@ -833,8 +833,8 @@ instance and factory instance.
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory` | str | The name of the registered component factory. |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->  |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.  |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.  |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |

View File

@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `labels_morph` | dict | <!-- TODO: --> |
| `labels_pos` | dict | <!-- TODO: --> |
| `labels_morph` | dict | |
| `labels_pos` | dict | |
## Morphologizer.\_\_call\_\_ {#call tag="method"}

View File

@ -290,6 +290,8 @@ factories.
> return Model("custom", forward, dims={"nO": nO})
> ```
<!-- TODO: finish table -->
| Registry name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
@ -297,7 +299,7 @@ factories.
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `assets` | <!-- TODO: what is this used for again?--> |
| `assets` | |
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |

View File

@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument.
Transformer tokens and outputs for one `Doc` object.
| Name | Type | Description |
| --------- | -------------------------------------------------- | ----------------------------------------- |
| `tokens` | `Dict` | <!-- TODO: --> |
| `tensors` | `List[FloatsXd]` | <!-- TODO: --> |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
| `width` | int | <!-- TODO: also mention it's property --> |
<!-- TODO: finish API docs, also mention "width" is property -->
| Name | Type | Description |
| --------- | -------------------------------------------------- | ----------- |
| `tokens` | `Dict` | |
| `tensors` | `List[FloatsXd]` | |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
| `width` | int | |
### TransformerData.empty {#transformerdata-empty tag="classmethod"}
<!-- TODO: -->
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------- | ----------------- | -------------- |
| **RETURNS** | `TransformerData` | <!-- TODO: --> |
| Name | Type | Description |
| ----------- | ----------------- | ----------- |
| **RETURNS** | `TransformerData` | |
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
<!-- TODO: -->
<!-- TODO: write, also mention doc_data is property -->
| Name | Type | Description |
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- |
| `spans` | `List[List[Span]]` | <!-- TODO: --> |
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | <!-- TODO: --> |
| `tensors` | `List[torch.Tensor]` | <!-- TODO: --> |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
| `doc_data` | `List[TransformerData]` | <!-- TODO: also mention it's property --> |
| Name | Type | Description |
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------- |
| `spans` | `List[List[Span]]` | |
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | |
| `tensors` | `List[torch.Tensor]` | |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
| `doc_data` | `List[TransformerData]` | |
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
<!-- TODO: -->
<!-- TODO: write -->
| Name | Type | Description |
| ----------- | ---------------------- | -------------- |
| `arrays` | `List[List[Floats3d]]` | <!-- TODO: --> |
| **RETURNS** | `FullTransformerBatch` | <!-- TODO: --> |
| Name | Type | Description |
| ----------- | ---------------------- | ----------- |
| `arrays` | `List[List[Floats3d]]` | |
| **RETURNS** | `FullTransformerBatch` | |
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
Split a `TransformerData` object that represents a batch into a list with one
`TransformerData` per `Doc`.
| Name | Type | Description |
| ----------- | ----------------------- | -------------- |
| **RETURNS** | `List[TransformerData]` | <!-- TODO: --> |
| Name | Type | Description |
| ----------- | ----------------------- | ----------- |
| **RETURNS** | `List[TransformerData]` | |
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator.
The following built-in functions are available:
<!-- TODO: finish API docs -->
| Name | Description |
| ------------------ | ------------------------------------------------------------------ |
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
| `strided_spans.v1` | <!-- TODO: --> |
| `strided_spans.v1` | |
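Span getters are referenced from the transformer's config by their registered name, with the registry name as the `@` key. A hedged sketch for `strided_spans.v1` (the section path and values are assumptions based on typical spacy-transformers configs, not taken from this commit):

```ini
[components.transformer.model.get_spans]
@span_getters = "strided_spans.v1"
window = 128
stride = 96
```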
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

View File

@ -231,10 +231,10 @@ available pipeline components and component functions.
| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | <!-- TODO: --> |
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | |
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
<!-- TODO: update with more components -->
<!-- TODO: finish and update with more components -->
<!-- TODO: explain default config and factories -->

View File

@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md'
### Serializing the pipeline {#pipeline}
<!-- TODO: update this -->
When serializing the pipeline, keep in mind that this will only save out the
**binary data for the individual components** to allow spaCy to restore them,
not the entire objects. This is a good thing, because it makes serialization