diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index ab0cb85c7..f5c539c42 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -1,6 +1,7 @@ from typing import List from thinc.api import Model from thinc.types import Floats2d + from ..tokens import Doc @@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: ) -def init(model, X=None, Y=None): +def init(model: Model, X=None, Y=None): vectors_table = model.ops.alloc3f( model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM") ) model.set_param("E", vectors_table) -def forward(model, docs, is_train): +def forward(model: Model, docs: List[Doc], is_train: bool): if docs is None: return [] ids = [] diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 9f385ec0d..4dbc79f52 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]: ) -def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): +def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None: if X is not None and Y is not None: if X.data.shape != Y.data.shape: # TODO: Fix error diff --git a/spacy/ml/extract_ngrams.py b/spacy/ml/extract_ngrams.py index f9f691aae..bdc297232 100644 --- a/spacy/ml/extract_ngrams.py +++ b/spacy/ml/extract_ngrams.py @@ -4,14 +4,14 @@ from thinc.api import Model from ..attrs import LOWER -def extract_ngrams(ngram_size, attr=LOWER) -> Model: +def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model: model = Model("extract_ngrams", forward) model.attrs["ngram_size"] = ngram_size model.attrs["attr"] = attr return model -def forward(model, docs, is_train: bool): +def forward(model: Model, docs, is_train: bool): batch_keys = [] batch_vals = [] for doc in docs: diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index ffd6c3c1c..f61fe2d5f 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ 
-1,5 +1,4 @@ -from pathlib import Path - +from typing import Optional from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear @@ -9,7 +8,7 @@ from ...vocab import Vocab @registry.architectures.register("spacy.EntityLinker.v1") -def build_nel_encoder(tok2vec, nO=None): +def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: with Model.define_operators({">>": chain, "**": clone}): token_width = tok2vec.get_dim("nO") output_layer = Linear(nO=nO, nI=token_width) @@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None): @registry.assets.register("spacy.KBFromFile.v1") -def load_kb(vocab_path, kb_path) -> KnowledgeBase: +def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase: vocab = Vocab().from_disk(vocab_path) kb = KnowledgeBase(vocab=vocab) kb.load_bulk(kb_path) diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index ed85b1a91..ac990c015 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -1,10 +1,20 @@ +from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING import numpy - from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import MultiSoftmax, list2array +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. 
without causing circular imports + from ...vocab import Vocab # noqa: F401 + from ...tokens import Doc # noqa: F401 -def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): + +def build_multi_task_model( + tok2vec: Model, + maxout_pieces: int, + token_vector_width: int, + nO: Optional[int] = None, +) -> Model: softmax = Softmax(nO=nO, nI=token_vector_width * 2) model = chain( tok2vec, @@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): return model -def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None): +def build_cloze_multi_task_model( + vocab: "Vocab", + tok2vec: Model, + maxout_pieces: int, + hidden_size: int, + nO: Optional[int] = None, +) -> Model: # nO = vocab.vectors.data.shape[1] output_layer = chain( list2array(), @@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO= def build_cloze_characters_multi_task_model( - vocab, tok2vec, maxout_pieces, hidden_size, nr_char -): + vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int +) -> Model: output_layer = chain( list2array(), Maxout(hidden_size, nP=maxout_pieces), LayerNorm(nI=hidden_size), MultiSoftmax([256] * nr_char, nI=hidden_size), ) - model = build_masked_language_model(vocab, chain(tok2vec, output_layer)) model.set_ref("tok2vec", tok2vec) model.set_ref("output_layer", output_layer) return model -def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): +def build_masked_language_model( + vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15 +) -> Model: """Convert a model into a BERT-style masked language model""" - random_words = _RandomWords(vocab) def mlm_forward(model, docs, is_train): @@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): return output, mlm_backward - def mlm_initialize(model, X=None, Y=None): + def mlm_initialize(model: Model, X=None, Y=None): 
wrapped = model.layers[0] wrapped.initialize(X=X, Y=Y) for dim in wrapped.dim_names: @@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): dims={dim: None for dim in wrapped_model.dim_names}, ) mlm_model.set_ref("wrapped", wrapped_model) - return mlm_model class _RandomWords: - def __init__(self, vocab): + def __init__(self, vocab: "Vocab") -> None: self.words = [lex.text for lex in vocab if lex.prob != 0.0] self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] self.words = self.words[:10000] @@ -104,7 +119,7 @@ class _RandomWords: self.probs /= self.probs.sum() self._cache = [] - def next(self): + def next(self) -> str: if not self._cache: self._cache.extend( numpy.random.choice(len(self.words), 10000, p=self.probs) @@ -113,9 +128,11 @@ class _RandomWords: return self.words[index] -def _apply_mask(docs, random_words, mask_prob=0.15): +def _apply_mask( + docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15 +) -> Tuple[numpy.ndarray, List["Doc"]]: # This needs to be here to avoid circular imports - from ...tokens import Doc + from ...tokens import Doc # noqa: F811 N = sum(len(doc) for doc in docs) mask = numpy.random.uniform(0.0, 1.0, (N,)) @@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15): return mask, masked_docs -def _replace_word(word, random_words, mask="[MASK]"): +def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str: roll = numpy.random.random() if roll < 0.8: return mask diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index c1e530d4a..429ceff28 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,6 +1,5 @@ -from pydantic import StrictInt -from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array -from thinc.api import LayerNorm, Maxout, Mish +from typing import Optional +from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops from ...util import 
registry from .._precomputable_affine import PrecomputableAffine @@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( tok2vec: Model, - nr_feature_tokens: StrictInt, - hidden_width: StrictInt, - maxout_pieces: StrictInt, - use_upper=True, - nO=None, -): + nr_feature_tokens: int, + hidden_width: int, + maxout_pieces: int, + use_upper: bool = True, + nO: Optional[int] = None, +) -> Model: t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) tok2vec.set_dim("nO", hidden_width) - lower = PrecomputableAffine( nO=hidden_width if use_upper else nO, nF=nr_feature_tokens, diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index 1fb5a71c0..b2934dadc 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -26,7 +26,6 @@ def BiluoTagger( with_array(softmax_activation()), padded2list(), ) - return Model( "biluo-tagger", forward, @@ -52,7 +51,6 @@ def IOBTagger( with_array(softmax_activation()), padded2list(), ) - return Model( "iob-tagger", forward, diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py index 7fe417321..78637e8b5 100644 --- a/spacy/ml/models/tagger.py +++ b/spacy/ml/models/tagger.py @@ -1,10 +1,11 @@ +from typing import Optional from thinc.api import zero_init, with_array, Softmax, chain, Model from ...util import registry @registry.architectures.register("spacy.Tagger.v1") -def build_tagger_model(tok2vec, nO=None) -> Model: +def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model: # TODO: glorot_uniform_init seems to work a bit better than zero_init here?! 
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None output_layer = Softmax(nO, t2v_width, init_W=zero_init) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 53200c165..0a25699dc 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -2,10 +2,9 @@ from typing import Optional from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum -from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued +from thinc.api import HashEmbed, with_array, with_cpu, uniqued from thinc.api import Relu, residual, expand_window, FeatureExtractor -from ... import util from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams @@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier( @registry.architectures.register("spacy.TextCatBOW.v1") -def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None): +def build_bow_text_classifier( + exclusive_classes: bool, + ngram_size: int, + no_output_layer: bool, + nO: Optional[int] = None, +) -> Model: with Model.define_operators({">>": chain}): sparse_linear = SparseLinear(nO) model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear @@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCatEnsemble.v1") def build_text_classifier( - width, - embed_size, - pretrained_vectors, - exclusive_classes, - ngram_size, - window_size, - conv_depth, - dropout, - nO=None, -): + width: int, + embed_size: int, + pretrained_vectors: Optional[bool], + exclusive_classes: bool, + ngram_size: int, + window_size: int, + conv_depth: int, + dropout: Optional[float], + nO: Optional[int] = None, +) -> Model: cols = [ORTH, LOWER, 
PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): lower = HashEmbed( @@ -91,7 +95,6 @@ def build_text_classifier( dropout=dropout, seed=13, ) - width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( uniqued( @@ -100,7 +103,6 @@ def build_text_classifier( column=cols.index(ORTH), ) ) - if pretrained_vectors: static_vectors = StaticVectors(width) vector_layer = trained_vectors | static_vectors @@ -152,7 +154,12 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") -def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): +def build_text_classifier_lowdata( + width: int, + pretrained_vectors: Optional[bool], + dropout: Optional[float], + nO: Optional[int] = None, +) -> Model: # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): model = ( diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 1460b3005..474942558 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from thinc.types import Floats2d from ...tokens import Doc -from ... 
import util from ...util import registry from ...ml import _character_embed from ..staticvectors import StaticVectors from ...pipeline.tok2vec import Tok2VecListener -from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE +from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE @registry.architectures.register("spacy.Tok2VecListener.v1") -def tok2vec_listener_v1(width, upstream="*"): +def tok2vec_listener_v1(width: int, upstream: str = "*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec @@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec( width=width, depth=depth, window_size=window_size, - maxout_pieces=maxout_pieces - ) + maxout_pieces=maxout_pieces, + ), ) + @registry.architectures.register("spacy.Tok2Vec.v1") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], @@ -68,7 +68,6 @@ def MultiHashEmbed( width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool ): cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] - seed = 7 def make_hash_embed(feature): @@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): chain( FeatureExtractor([NORM]), list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) - ) + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), ), with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), - ragged2list() + ragged2list(), ) return model @@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: def MishWindowEncoder(width, window_size, depth): cnn = chain( expand_window(window_size=window_size), - Mish( - nO=width, - nI=width * ((window_size * 2) + 1), - dropout=0.0, - normalize=True - ), + Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) diff --git a/spacy/util.py b/spacy/util.py index 677f5e8e0..305a9a535 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,7 
+7,7 @@ import importlib.util import re from pathlib import Path import thinc -from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model +from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer import functools import itertools import numpy.random @@ -24,8 +24,6 @@ import tempfile import shutil import shlex import inspect -from thinc.types import Unserializable - try: import cupy.random diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 95f7d0597..a22ee5be8 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -6,6 +6,7 @@ menu: - ['Tok2Vec', 'tok2vec'] - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] + - ['Tagging', 'tagger'] - ['Text Classification', 'textcat'] - ['Entity Linking', 'entitylinker'] --- @@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.HashEmbedCNN.v1" +> # TODO: ... +> +> [model.tok2vec] +> # ... +> ``` + +| Name | Type | Description | +| -------------------- | ----- | ----------- | +| `width` | int | | +| `depth` | int | | +| `embed_size` | int | | +| `window_size` | int | | +| `maxout_pieces` | int | | +| `subword_features` | bool | | +| `dropout` | float | | +| `pretrained_vectors` | bool | | + ### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN} ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM} @@ -99,6 +124,28 @@ architectures into your training config. | `use_upper` | bool | | | `nO` | int | | +## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} + +### spacy.Tagger.v1 {#Tagger} + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.Tagger.v1" +> nO = null +> +> [model.tok2vec] +> # ... 
+> ``` + +| Name | Type | Description | +| --------- | ------------------------------------------ | ----------- | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | | +| `nO` | int | | + ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} @@ -112,3 +159,21 @@ architectures into your training config. ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} ### spacy.EntityLinker.v1 {#EntityLinker} + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy.EntityLinker.v1" +> nO = null +> +> [model.tok2vec] +> # ... +> ``` + +| Name | Type | Description | +| --------- | ------------------------------------------ | ----------- | +| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | | +| `nO` | int | | diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index a18e9e582..e56e85e64 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("parser", config=config) > ``` + + | Setting | Type | Description | Default | | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | +| `moves` | list | | `None` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python @@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). 
+ + | Name | Type | Description | | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `moves` | list | | +| `moves` | list | | | _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| `update_with_oracle_cut_size` | int | | +| `multitasks` | `Iterable` | | +| `learn_tokens` | bool | | +| `min_action_freq` | int | | ## DependencyParser.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 2a1ba94d2..1e9beaf82 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` + + | Setting | Type | Description | Default | | ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | -| `kb` | `KnowledgeBase` | | `None` | -| `labels_discard` | `Iterable[str]` | | `[]` | -| `incl_prior` | bool | |  `True` | -| `incl_context` | bool | | `True` | +| `kb` | `KnowledgeBase` | | `None` | +| `labels_discard` | `Iterable[str]` | | `[]` | +| `incl_prior` | bool | |  `True` | +| `incl_context` | bool | | `True` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | ```python @@ -65,16 +67,18 @@ Create a new pipeline instance. 
In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). + + | Name | Type | Description | | ---------------- | --------------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `kb` | `KnowlegeBase` | | -| `labels_discard` | `Iterable[str]` | | -| `incl_prior` | bool | | -| `incl_context` | bool | | +| `kb` | `KnowledgeBase` | | +| `labels_discard` | `Iterable[str]` | | +| `incl_prior` | bool | | +| `incl_context` | bool | | ## EntityLinker.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index b5b549a04..0ab17f953 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("ner", config=config) > ``` + + | Setting | Type | Description | Default | | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | +| `moves` | list | | `None` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python @@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe).
+ + | Name | Type | Description | | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `moves` | list | | +| `moves` | list | | | _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| `update_with_oracle_cut_size` | int | | +| `multitasks` | `Iterable` | | +| `learn_tokens` | bool | | +| `min_action_freq` | int | | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/example.md b/website/docs/api/example.md index e6299fc31..1257fdc1e 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -8,9 +8,8 @@ new: 3.0 An `Example` holds the information for one training instance. It stores two `Doc` objects: one for holding the gold-standard reference data, and one for -holding the predictions of the pipeline. An `Alignment` -object stores the alignment between these two documents, as they can differ in -tokenization. +holding the predictions of the pipeline. An `Alignment` object stores the +alignment between these two documents, as they can differ in tokenization. ## Example.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 377852a69..0662fb12a 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -98,9 +98,9 @@ decorator. 
For more details and examples, see the | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `name` | str | The name of the component factory. | | _keyword-only_ | | | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `func` | `Optional[Callable]` | Optional function if not used a a decorator. | @@ -146,9 +146,9 @@ examples, see the | `name` | str | The name of the component factory. | | _keyword-only_ | | | | `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. 
Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. | | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `func` | `Optional[Callable]` | Optional function if not used a a decorator. | @@ -833,8 +833,8 @@ instance and factory instance. | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory` | str | The name of the registered component factory. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.   
| -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.   | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.  | +| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.  | | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index ac7146543..bfe5c3c77 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). + + | Name | Type | Description | | -------------- | ------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. 
| | _keyword-only_ | | | -| `labels_morph` | dict | | -| `labels_pos` | dict | | +| `labels_morph` | dict | | +| `labels_pos` | dict | | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index ede7f9e21..368b58a9b 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -290,6 +290,8 @@ factories. > return Model("custom", forward, dims={"nO": nO}) > ``` + + | Registry name | Description | | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | @@ -297,7 +299,7 @@ factories. | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | | `lookups` | Registry for large lookup tables available via `vocab.lookups`. | | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | -| `assets` | | +| `assets` | | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). 
| diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 70128d225..6b6be6bd0 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument. Transformer tokens and outputs for one `Doc` object. -| Name | Type | Description | -| --------- | -------------------------------------------------- | ----------------------------------------- | -| `tokens` | `Dict` | | -| `tensors` | `List[FloatsXd]` | | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | -| `width` | int | | + + +| Name | Type | Description | +| --------- | -------------------------------------------------- | ----------- | +| `tokens` | `Dict` | | +| `tensors` | `List[FloatsXd]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `width` | int | | ### TransformerData.empty {#transformerdata-emoty tag="classmethod"} - + -| Name | Type | Description | -| ----------- | ----------------- | -------------- | -| **RETURNS** | `TransformerData` | | +| Name | Type | Description | +| ----------- | ----------------- | ----------- | +| **RETURNS** | `TransformerData` | | ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} - + -| Name | Type | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | -| `spans` | `List[List[Span]]` | | -| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | | -| `tensors` | `List[torch.Tensor]` | | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | -| `doc_data` | `List[TransformerData]` | | +| Name | Type | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------- 
| ----------- | +| `spans` | `List[List[Span]]` | | +| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | | +| `tensors` | `List[torch.Tensor]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `doc_data` | `List[TransformerData]` | | ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} - + -| Name | Type | Description | -| ----------- | ---------------------- | -------------- | -| `arrays` | `List[List[Floats3d]]` | | -| **RETURNS** | `FullTransformerBatch` | | +| Name | Type | Description | +| ----------- | ---------------------- | ----------- | +| `arrays` | `List[List[Floats3d]]` | | +| **RETURNS** | `FullTransformerBatch` | | ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} Split a `TransformerData` object that represents a batch into a list with one `TransformerData` per `Doc`. -| Name | Type | Description | -| ----------- | ----------------------- | -------------- | -| **RETURNS** | `List[TransformerData]` | | +| Name | Type | Description | +| ----------- | ----------------------- | ----------- | +| **RETURNS** | `List[TransformerData]` | | ## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} @@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator. The following built-in functions are available: + + | Name | Description | | ------------------ | ------------------------------------------------------------------ | | `doc_spans.v1` | Create a span for each doc (no transformation, process each text). | | `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. 
| -| `strided_spans.v1` | | +| `strided_spans.v1` | | ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 2bdd560da..486cef1be 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -231,10 +231,10 @@ available pipeline components and component functions. | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | -| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | - + diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index cdd7d1c49..904477733 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md' ### Serializing the pipeline {#pipeline} - - When serializing the pipeline, keep in mind that this will only save out the **binary data for the individual components** to allow spaCy to restore them – not the entire objects. This is a good thing, because it makes serialization