From 82f0e20318dc43cb5463b53d046694c11fb069cc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Aug 2020 14:39:40 +0200 Subject: [PATCH] Update docs and consistency [ci skip] --- spacy/cli/package.py | 1 + spacy/schemas.py | 26 +- website/docs/api/architectures.md | 134 +++++--- website/docs/api/cli.md | 12 +- website/docs/api/data-formats.md | 163 +++++++--- website/docs/api/language.md | 14 +- website/docs/api/tok2vec.md | 2 +- website/docs/api/top-level.md | 21 +- website/docs/images/tok2vec-listener.svg | 41 +++ website/docs/images/tok2vec.svg | 17 + website/docs/usage/embeddings-transformers.md | 34 +- website/docs/usage/models.md | 17 +- website/docs/usage/processing-pipelines.md | 2 +- website/docs/usage/rule-based-matching.md | 11 +- website/docs/usage/saving-loading.md | 50 ++- website/docs/usage/training.md | 66 ++-- website/docs/usage/transformers.md | 305 ------------------ website/docs/usage/v3.md | 6 + website/meta/type-annotations.json | 1 + website/src/components/icon.js | 11 +- website/src/components/table.js | 28 +- website/src/widgets/quickstart-training.js | 13 +- 22 files changed, 468 insertions(+), 507 deletions(-) create mode 100644 website/docs/images/tok2vec-listener.svg create mode 100644 website/docs/images/tok2vec.svg delete mode 100644 website/docs/usage/transformers.md diff --git a/spacy/cli/package.py b/spacy/cli/package.py index a1162f3e1..523e8a99a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -229,6 +229,7 @@ if __name__ == '__main__': TEMPLATE_MANIFEST = """ include meta.json +include config.cfg """.strip() diff --git a/spacy/schemas.py b/spacy/schemas.py index 0f2a35c60..e219c2009 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -167,18 +167,20 @@ class ModelMetaSchema(BaseModel): lang: StrictStr = Field(..., title="Two-letter language code, e.g. 
'en'") name: StrictStr = Field(..., title="Model name") version: StrictStr = Field(..., title="Model version") - spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier") - parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly") - pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components") - description: Optional[StrictStr] = Field(None, title="Model description") - license: Optional[StrictStr] = Field(None, title="Model license") - author: Optional[StrictStr] = Field(None, title="Model author name") - email: Optional[StrictStr] = Field(None, title="Model author email") - url: Optional[StrictStr] = Field(None, title="Model author URL") - sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") - vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors") - accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") - speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") + spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier") + parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. 
spacy or spacy-nightly") + pipeline: List[StrictStr] = Field([], title="Names of pipeline components") + description: StrictStr = Field("", title="Model description") + license: StrictStr = Field("", title="Model license") + author: StrictStr = Field("", title="Model author name") + email: StrictStr = Field("", title="Model author email") + url: StrictStr = Field("", title="Model author URL") + sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") + vectors: Dict[str, Any] = Field({}, title="Included word vectors") + labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name") + accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") + speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") + spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 8bb5cdeea..737ca2fa2 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -33,7 +33,7 @@ TODO: intro and how architectures work, link to > subword_features = true > ``` -Build spaCy's "standard" tok2vec layer, which uses hash embedding with subword +Build spaCy's "standard" embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. | Name | Description | @@ -45,6 +45,7 @@ features and a CNN with layer-normalized maxout. | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ | | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. 
~~bool~~ | | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.Tok2Vec.v1 {#Tok2Vec} @@ -67,10 +68,11 @@ Construct a tok2vec model out of embedding and encoding subnetworks. See the ["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp) blog post for background. -| Name | Description | -| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ | -| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ | +| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.Tok2VecListener.v1 {#Tok2VecListener} @@ -108,10 +110,13 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` argument that connects to the shared `tok2vec` component in the pipeline. -| Name | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ | -| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ | + + +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ | +| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ | +| **CREATES** | The model using the architecture. 
~~Model~~ | ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} @@ -134,12 +139,15 @@ definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained static vectors can also be incorporated into the concatenated representation. + + | Name | Description | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ | | `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ | | `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ | | `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model~~ | ### spacy.CharacterEmbed.v1 {#CharacterEmbed} @@ -170,12 +178,15 @@ concatenated. A hash-embedded vector of the `NORM` of the word is also concatenated on, and the result is then passed through a feed-forward network to construct a single vector to represent the information. -| Name | Description | -| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ | -| `rows` | The number of rows in the `NORM` hash embedding table. 
~~int~~ | -| `nM` | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. ~~int~~ | -| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ | + + +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ | +| `rows` | The number of rows in the `NORM` hash embedding table. ~~int~~ | +| `nM` | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. ~~int~~ | +| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model~~ | ### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder} @@ -199,6 +210,7 @@ and residual connections. | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | | `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | ### spacy.MishWindowEncoder.v1 {#MishWindowEncoder} @@ -221,6 +233,7 @@ and residual connections. | `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. 
~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | ### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder} @@ -242,10 +255,38 @@ Encode context using bidirectional LSTM layers. Requires | `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | ### spacy.StaticVectors.v1 {#StaticVectors} - +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.StaticVectors.v1" +> nO = null +> nM = null +> dropout = 0.2 +> key_attr = "ORTH" +> +> [model.init_W] +> @initializers = "glorot_uniform_init.v1" +> ``` + +Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a +learned linear projection to control the dimensionality. See the documentation +on [static vectors](/usage/embeddings-transformers#static-vectors) for details. + + + +| Name |  Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nO` | Defaults to `None`. ~~Optional[int]~~ | +| `nM` | Defaults to `None`. ~~Optional[int]~~ | +| `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ | +| `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). 
Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]], FloatsXd]~~ | +| `key_attr` | Defaults to `"ORTH"`. ~~str~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} @@ -277,6 +318,7 @@ architectures into your training config. | `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ | | `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | | `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ | ### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener} @@ -305,6 +347,7 @@ a single token vector given zero or more wordpiece vectors. | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | | `grad_factor` | Reweight gradients from the component before passing them upstream. 
You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer} @@ -330,6 +373,7 @@ one component. | `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | | `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | | `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ## Parser & NER architectures {#parser} @@ -372,6 +416,8 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. + + | Name | Description | | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. 
~~Model[List[Doc], List[Floats2d]]~~ | @@ -380,6 +426,7 @@ consists of either two or three subnetworks: | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | | `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model~~ | ### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"} @@ -406,9 +453,10 @@ generally results in better linear separation between classes, especially for non-CRF models, because there are more distinct classes for the different situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)). -| Name | Description | -| --------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"} @@ -431,9 +479,10 @@ spans into tags assigned to each token. The first token of a span is given the tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens are assigned the tag O. -| Name | Description | -| --------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} @@ -454,10 +503,11 @@ Build a tagger model, using a provided token-to-vector component. The tagger model simply adds a linear layer with softmax activation to predict scores given the token vectors. -| Name | Description | -| --------- | ------------------------------------------------------------------------------------------ | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} @@ -474,9 +524,6 @@ specific data and challenge. ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} -Stacked ensemble of a bag-of-words model and a neural network model. The neural -network has an internal CNN Tok2Vec layer and uses attention. - > #### Example Config > > ```ini @@ -493,6 +540,11 @@ network has an internal CNN Tok2Vec layer and uses attention. > nO = null > ``` +Stacked ensemble of a bag-of-words model and a neural network model. The neural +network has an internal CNN Tok2Vec layer and uses attention. + + + | Name | Description | | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | @@ -504,6 +556,7 @@ network has an internal CNN Tok2Vec layer and uses attention. | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | | `dropout` | The dropout rate. ~~float~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model~~ | ### spacy.TextCatCNN.v1 {#TextCatCNN} @@ -530,11 +583,14 @@ A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. 
+ + | Name | Description | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model~~ | ### spacy.TextCatBOW.v1 {#TextCatBOW} @@ -552,12 +608,15 @@ architecture is usually less accurate than the ensemble, but runs faster. An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. + + | Name | Description | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ | | `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. 
~~Model~~ | ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} @@ -574,9 +633,6 @@ into the "real world". This requires 3 main component ### spacy.EntityLinker.v1 {#EntityLinker} -The `EntityLinker` model architecture is a Thinc `Model` with a -[`Linear`](https://thinc.ai/api-layers#linear) output layer. - > #### Example Config > > ```ini @@ -602,10 +658,16 @@ The `EntityLinker` model architecture is a Thinc `Model` with a > @assets = "spacy.CandidateGenerator.v1" > ``` -| Name | Description | -| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | -| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ | +The `EntityLinker` model architecture is a Thinc `Model` with a +[`Linear`](https://thinc.ai/api-layers#linear) output layer. + + + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. 
~~Model~~ | ### spacy.EmptyKB.v1 {#EmptyKB} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index b614898df..c7a1c3f06 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -719,11 +719,11 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] Generate an installable [model Python package](/usage/training#models-generating) from an existing model -data directory. All data files are copied over. If the path to a `meta.json` is -supplied, or a `meta.json` is found in the input directory, this file is used. -Otherwise, the data can be entered directly from the command line. spaCy will -then create a `.tar.gz` archive file that you can distribute and install with -`pip install`. +data directory. All data files are copied over. If the path to a +[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in +the input directory, this file is used. Otherwise, the data can be entered +directly from the command line. spaCy will then create a `.tar.gz` archive file +that you can distribute and install with `pip install`. @@ -750,7 +750,7 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] | ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `input_dir` | Path to directory containing model data. ~~Path (positional)~~ | | `output_dir` | Directory to create package folder in. ~~Path (positional)~~ | -| `--meta-path`, `-m` 2 | Path to `meta.json` file (optional). ~~Optional[Path] \(option)~~ | +| `--meta-path`, `-m` 2 | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | | `--create-meta`, `-C` 2 | Create a `meta.json` file on the command line, even if one already exists in the directory. 
If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | | `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ | | `--version`, `-v` 3 | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 4577d7ef3..56528de43 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -6,6 +6,7 @@ menu: - ['Training Data', 'training'] - ['Pretraining Data', 'pretraining'] - ['Vocabulary', 'vocab'] + - ['Model Meta', 'meta'] --- This section documents input and output formats of data used by spaCy, including @@ -73,15 +74,15 @@ your config and check that it's valid, you can run the Defines the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. -| Name | Description | Default | -| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- | -| `lang` | The language code to use. ~~str~~ | `null` | -| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). ~~List[str]~~ | `[]` | -| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. ~~bool~~ | `true` | -| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. 
~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | `null` | -| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. ~~Optional[Callable[[Language], Language]]~~ | `null` | -| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. ~~Optional[Callable[[Language], Language]]~~ | `null` | -| `tokenizer` | The tokenizer to use. ~~Callable[[str], Doc]~~ | [`Tokenizer`](/api/tokenizer) | +| Name | Description | +| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | +| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ | +| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | +| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | +| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. 
~~Optional[Callable[[Language], Language]]~~ | +| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ | ### components {#config-components tag="section"} @@ -128,24 +129,24 @@ process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | Default | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- | -| `seed` | The random seed. ~~int~~ | `${system:seed}` | -| `dropout` | The dropout rate. ~~float~~ | `0.1` | -| `accumulate_gradient` | Whether to divide the batch up into substeps. ~~int~~ | `1` | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). ~~Optional[str]~~ | `${paths:init_tok2vec}` | -| `raw_text` | ~~Optional[str]~~ | `${paths:raw}` | -| `vectors` | ~~Optional[str]~~ | `null` | -| `patience` | How many steps to continue without improvement in evaluation score. ~~int~~ | `1600` | -| `max_epochs` | Maximum number of epochs to train for. ~~int~~ | `0` | -| `max_steps` | Maximum number of update steps to train for. ~~int~~ | `20000` | -| `eval_frequency` | How often to evaluate during training (steps). ~~int~~ | `200` | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. ~~Dict[str, float]~~ | `{}` | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. 
~~List[str]~~ | `[]` | -| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) | -| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | [`batch_by_words`](/api/top-level#batch_by_words) | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) | +| Name | Description | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ | +| `raw_text` | TODO: ... Defaults to variable `${paths:raw}`. ~~Optional[str]~~ | +| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. 
Defaults to `20000`. ~~int~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -153,19 +154,19 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/training#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Description | Default | -| ---------------------------- | ----------------------------------------------------------------------------------------------------------- | --------------------------------------------------- | -| `max_epochs` | Maximum number of epochs. ~~int~~ | `1000` | -| `min_length` | Minimum length of examples. 
~~int~~ | `5` | -| `max_length` | Maximum length of examples. ~~int~~ | `500` | -| `dropout` | The dropout rate. ~~float~~ | `0.2` | -| `n_save_every` | Saving frequency. ~~int~~ | `null` | -| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). ~~Union[int, Sequence[int]]~~ | `3000` | -| `seed` | The random seed. ~~int~~ | `${system.seed}` | -| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. ~~bool~~ | `${system:use_pytorch_for_gpu_memory}` | -| `tok2vec_model` | tok2vec model section in the config. ~~str~~ | `"components.tok2vec.model"` | -| `objective` | The pretraining objective. ~~Dict[str, Any]~~ | `{"type": "characters", "n_characters": 4}` | -| `optimizer` | The optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) | +| Name | Description | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `min_length` | Minimum length of examples. Defaults to `5`. ~~int~~ | +| `max_length` | Maximum length of examples. Defaults to `500`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ | +| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ | +| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ | +| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ | +| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. 
~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | ## Training data {#training} @@ -372,11 +373,11 @@ example = Example.from_dict(doc, gold_dict) ## Pretraining data {#pretraining} -The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the tok2vec -layer of pipeline components from raw text. Raw text can be provided as a -`.jsonl` (newline-delimited JSON) file containing one input text per line -(roughly paragraph length is good). Optionally, custom tokenization can be -provided. +The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the +"token-to-vector" embedding layer of pipeline components from raw text. Raw text +can be provided as a `.jsonl` (newline-delimited JSON) file containing one input +text per line (roughly paragraph length is good). Optionally, custom +tokenization can be provided. > #### Tip: Writing JSONL > @@ -457,3 +458,75 @@ Here's an example of the 20 most frequent lexemes in the English training data: ```json https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl ``` + +## Model meta {#meta} + +The model meta is available as the file `meta.json` and exported automatically +when you save an `nlp` object to disk. Its contents are available as +[`nlp.meta`](/api/language#meta). + + + +As of spaCy v3.0, the `meta.json` **isn't** used to construct the language class +and pipeline anymore and only contains meta information for reference and for +creating a Python package with [`spacy package`](/api/cli#package). How to set +up the `nlp` object is now defined in the +[`config.cfg`](/api/data-formats#config), which includes detailed information +about the pipeline components and their model architectures, and all other +settings and hyperparameters used to train the model. It's the **single source +of truth** used for loading a model. 
+ + + +> #### Example +> +> ```json +> { +> "name": "example_model", +> "lang": "en", +> "version": "1.0.0", +> "spacy_version": ">=3.0.0,<3.1.0", +> "parent_package": "spacy", +> "description": "Example model for spaCy", +> "author": "You", +> "email": "you@example.com", +> "url": "https://example.com", +> "license": "CC BY-SA 3.0", +> "sources": [{ "name": "My Corpus", "license": "MIT" }], +> "vectors": { "width": 0, "vectors": 0, "keys": 0, "name": null }, +> "pipeline": ["tok2vec", "ner", "textcat"], +> "labels": { +> "ner": ["PERSON", "ORG", "PRODUCT"], +> "textcat": ["POSITIVE", "NEGATIVE"] +> }, +> "accuracy": { +> "ents_f": 82.7300930714, +> "ents_p": 82.135523614, +> "ents_r": 83.3333333333, +> "textcat_score": 88.364323811 +> }, +> "speed": { "cpu": 7667.8, "gpu": null, "nwords": 10329 }, +> "spacy_git_version": "61dfdd9fb" +> } +> ``` + +| Name | Description | +| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ | +| `name` | Model name, e.g. `"core_web_sm"`. The final model package name will be `{lang}_{name}`. Defaults to `"model"`. ~~str~~ | +| `version` | Model version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ | +| `spacy_version` | spaCy version range the model is compatible with. Defaults to spaCy version used to create the model, up to next minor version, which is the default compatibility for the available [pretrained models](/models). For instance, a model trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. 
~~str~~ | +| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy-nightly"`. Defaults to `"spacy"`. ~~str~~ | +| `description` | Model description. Also used for Python package. Defaults to `""`. ~~str~~ | +| `author` | Model author name. Also used for Python package. Defaults to `""`. ~~str~~ | +| `email` | Model author email. Also used for Python package. Defaults to `""`. ~~str~~ | +| `url` | Model author URL. Also used for Python package. Defaults to `""`. ~~str~~ | +| `license` | Model license. Also used for Python package. Defaults to `""`. ~~str~~ | +| `sources` | Data sources used to train the model. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ | +| `vectors` | Information about the word vectors included with the model. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ | +| `pipeline` | Names of pipeline components in the model, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ | +| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ | +| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| `speed` | Model speed, added automatically by [`spacy train`](/api/cli#train). 
Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ | +| `spacy_git_version` 3 | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create model. ~~str~~ | +| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 871adc0f2..34e3569a7 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -742,7 +742,7 @@ token.ent_iob, token.ent_type Custom meta data for the Language class. If a model is loaded, contains meta data of the model. The `Language.meta` is also what's serialized as the -`meta.json` when you save an `nlp` object to disk. +[`meta.json`](/api/data-formats#meta) when you save an `nlp` object to disk. > #### Example > @@ -954,12 +954,12 @@ serialization by passing in the string names via the `exclude` argument. > nlp.from_disk("./model-data", exclude=["ner"]) > ``` -| Name | Description | -| ----------- | -------------------------------------------------- | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `tokenizer` | Tokenization rules and exceptions. | -| `meta` | The meta data, available as `Language.meta`. | -| ... | String names of pipeline components, e.g. `"ner"`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------ | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `tokenizer` | Tokenization rules and exceptions. | +| `meta` | The meta data, available as [`Language.meta`](/api/language#meta). | +| ... | String names of pipeline components, e.g. `"ner"`. 
| ## FactoryMeta {#factorymeta new="3" tag="dataclass"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 833a50b33..deb8369ab 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -15,7 +15,7 @@ multiple components, e.g. to have one embedding and CNN network shared between a [`EntityRecognizer`](/api/entityrecognizer). In order to use the `Tok2Vec` predictions, subsequent components should use the -[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec +[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the `tok2vec` subnetwork of their model. This layer will read data from the `doc.tensor` attribute during prediction. During training, the `Tok2Vec` component will save its prediction and backprop callback for each batch, so that the subsequent diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index c44e4e5b4..de0b3d36c 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -18,9 +18,10 @@ Load a model using the name of an installed `Path`-like object. spaCy will try resolving the load argument in this order. If a model is loaded from a model name, spaCy will assume it's a Python package and import it and call the model's own `load()` method. If a model is loaded from a -path, spaCy will assume it's a data directory, read the language and pipeline -settings off the meta.json and initialize the `Language` class. The data will be -loaded in via [`Language.from_disk`](/api/language#from_disk). +path, spaCy will assume it's a data directory, load its +[`config.cfg`](/api/data-formats#config) and use the language and pipeline +information to construct the `Language` class. The data will be loaded in via +[`Language.from_disk`](/api/language#from_disk). > #### Example > @@ -40,9 +41,10 @@ loaded in via [`Language.from_disk`](/api/language#from_disk). 
| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | **RETURNS** | A `Language` object with the loaded model. ~~Language~~ | -Essentially, `spacy.load()` is a convenience wrapper that reads the language ID -and pipeline components from a model's `meta.json`, initializes the `Language` -class, loads in the model data and returns it. +Essentially, `spacy.load()` is a convenience wrapper that reads the model's +[`config.cfg`](/api/data-formats#config), uses the language and pipeline +information to construct a `Language` object, loads in the model data and +returns it. ```python ### Abstract example @@ -543,8 +545,8 @@ loaded lazily, to avoid expensive setup code associated with the language data. Load a model from a package or data path. If called with a package name, spaCy will assume the model is a Python package and import and call its `load()` method. If called with a path, spaCy will assume it's a data directory, read the -language and pipeline settings from the meta.json and initialize a `Language` -class. The model data will then be loaded in via +language and pipeline settings from the [`config.cfg`](/api/data-formats#config) +and create a `Language` object. The model data will then be loaded in via [`Language.from_disk`](/api/language#from_disk). > #### Example @@ -607,7 +609,8 @@ components are created, as well as all training settings and hyperparameters. ### util.load_meta {#util.load_meta tag="function" new="3"} -Get a model's `meta.json` from a file path and validate its contents. +Get a model's [`meta.json`](/api/data-formats#meta) from a file path and +validate its contents. 
> #### Example > diff --git a/website/docs/images/tok2vec-listener.svg b/website/docs/images/tok2vec-listener.svg new file mode 100644 index 000000000..bb67d2186 --- /dev/null +++ b/website/docs/images/tok2vec-listener.svg @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/images/tok2vec.svg b/website/docs/images/tok2vec.svg new file mode 100644 index 000000000..5338b6280 --- /dev/null +++ b/website/docs/images/tok2vec.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 23037a3ab..df9e68282 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -9,11 +9,7 @@ menu: next: /usage/training --- - - -## Shared embedding layers {#embedding-layers} - - + @@ -55,6 +51,22 @@ of performance. +## Shared embedding layers {#embedding-layers} + + + +![Pipeline components using a shared embedding component vs. 
independent embedding layers](../images/tok2vec.svg) + +| Shared | Independent | +| ------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | +| ✅ **smaller:** models only need to include a single copy of the embeddings | ❌ **larger:** models need to include the embeddings for each component | +| ✅ **faster:** | ❌ **slower:** | +| ❌ **less composable:** all components require the same embedding component in the pipeline | ✅ **modular:** components can be moved and swapped freely | + +![Pipeline components listening to shared embedding component](../images/tok2vec-listener.svg) + + + ## Using transformer models {#transformers} Transformers are a family of neural network architectures that compute **dense, @@ -295,18 +307,6 @@ is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the transformer data. -> #### Model type annotations -> -> In the documentation and code base, you may come across type annotations and -> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc], -> List[Floats2d]]~~. This so-called generic type describes the layer and its -> input and output type – in this case, it takes a list of `Doc` objects as the -> input and list of 2-dimensional arrays of floats as the output. You can read -> more about defining Thinc models [here](https://thinc.ai/docs/usage-models). -> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for -> how to enable linting in your editor to see live feedback if your inputs and -> outputs don't match. - The same idea applies to task models that power the **downstream components**. Most of spaCy's built-in model creation functions support a `tok2vec` argument, which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. 
This diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 1ea39fa83..be98cd36c 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -70,8 +70,7 @@ import Languages from 'widgets/languages.js' > nlp = MultiLanguage() > > # With lazy-loading -> from spacy.util import get_lang_class -> nlp = get_lang_class('xx') +> nlp = spacy.blank("xx") > ``` spaCy also supports models trained on more than one language. This is especially @@ -80,10 +79,10 @@ language-neutral models is `xx`. The language class, a generic subclass containing only the base language data, can be found in [`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx). -To load your model with the neutral, multi-language class, simply set -`"language": "xx"` in your [model package](/usage/training#models-generating)'s -`meta.json`. You can also import the class directly, or call -[`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading. +To train a model using the neutral multi-language class, you can set +`lang = "xx"` in your [training config](/usage/training#config). You can also +import the `MultiLanguage` class directly, or call +[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. ### Chinese language support {#chinese new=2.3} @@ -308,12 +307,14 @@ model data. ```yaml ### Directory structure {highlight="7"} └── en_core_web_md-3.0.0.tar.gz # downloaded archive - ├── meta.json # model meta data ├── setup.py # setup file for pip installation + ├── meta.json # copy of model meta └── en_core_web_md # 📦 model package ├── __init__.py # init for pip installation - ├── meta.json # model meta data └── en_core_web_md-3.0.0 # model data + ├── config.cfg # model config + ├── meta.json # model meta + └── ... 
# directories with component data ``` You can place the **model package directory** anywhere on your local file diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 2b040a832..73ad88bcc 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -232,7 +232,7 @@ available pipeline components and component functions. | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | -| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | ### Disabling and modifying pipeline components {#disabling} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 66a3daf6e..ce6625897 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1096,11 +1096,12 @@ ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) nlp.to_disk("/path/to/model") ``` -The saved model now includes the `"entity_ruler"` in its `"pipeline"` setting in -the `meta.json`, and the model directory contains a file `entityruler.jsonl` -with the patterns. When you load the model back in, all pipeline components will -be restored and deserialized – including the entity ruler. This lets you ship -powerful model packages with binary weights _and_ rules included! +The saved model now includes the `"entity_ruler"` in its +[`config.cfg`](/api/data-formats#config) and the model directory contains a file +`entityruler.jsonl` with the patterns. 
When you load the model back in, all +pipeline components will be restored and deserialized – including the entity +ruler. This lets you ship powerful model packages with binary weights _and_ +rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 904477733..f8bb1bfa9 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -569,9 +569,32 @@ back later. You can do this with the nlp.to_disk('/home/me/data/en_example_model') ``` -The directory will be created if it doesn't exist, and the whole pipeline will -be written out. To make the model more convenient to deploy, we recommend -wrapping it as a Python package. +The directory will be created if it doesn't exist, and the whole pipeline data, +model meta and model configuration will be written out. To make the model more +convenient to deploy, we recommend wrapping it as a +[Python package](/api/cli#package). + + + +When you save a model in spaCy v3.0+, two files will be exported: a +[`config.cfg`](/api/data-formats#config) based on +[`nlp.config`](/api/language#config) and a [`meta.json`](/api/data-formats#meta) +based on [`nlp.meta`](/api/language#meta). + +- **config**: Configuration used to create the current `nlp` object, its + pipeline components and models, as well as training settings and + hyperparameters. Can include references to registered functions like + [pipeline components](/usage/processing-pipelines#custom-components) or + [model architectures](/api/architectures). Given a config, spaCy is able to + reconstruct the whole tree of objects and the `nlp` object. An exported config + can also be used to [train a model](/usage/training#config) with the same + settings. +- **meta**: Meta information about the model and the Python package, such as the + author information, license, version, data sources and label scheme. 
This is + mostly used for documentation purposes and for packaging models. It has no + impact on the functionality of the `nlp` object. + + ### Generating a model package {#models-generating} @@ -623,6 +646,9 @@ model package that can be installed using `pip install`. ├── en_example_model # model directory │ ├── __init__.py # init for pip installation │ └── en_example_model-1.0.0 # model data + │ ├── config.cfg # model config + │ ├── meta.json # model meta + │ └── ... # directories with component data └── dist └── en_example_model-1.0.0.tar.gz # installable package ``` @@ -644,13 +670,25 @@ you can also **ship the code with your model** and include it in the [pipeline components](/usage/processing-pipelines#custom-components) before the `nlp` object is created. + + +While it's no problem to edit the package code or meta information, avoid making +edits to the `config.cfg` **after** training, as this can easily lead to data +incompatibility. For instance, changing an architecture or hyperparameter can +mean that the trained weights are now incompatible. If you want to make +adjustments, you can do so before training. Otherwise, you should always trust +spaCy to export the current state of its `nlp` objects via +[`nlp.config`](/api/language#config). + + + ### Loading a custom model package {#loading} To load a model from a data directory, you can use [`spacy.load()`](/api/top-level#spacy.load) with the local path. This will look -for a meta.json in the directory and use the `lang` and `pipeline` settings to -initialize a `Language` class with a processing pipeline and load in the model -data. +for a `config.cfg` in the directory and use the `lang` and `pipeline` settings +to initialize a `Language` class with a processing pipeline and load in the +model data. 
```python nlp = spacy.load("/path/to/model") diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 3943cf061..31ba902b0 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -384,19 +384,40 @@ that reference this variable. ### Model architectures {#model-architectures} -A **model architecture** is a function that wires up a Thinc `Model` instance, -which you can then use in a component or as a layer of a larger network. You -can use Thinc as a thin wrapper around frameworks such as PyTorch, Tensorflow -or MXNet, or you can implement your logic in Thinc directly. +> #### 💡 Model type annotations +> +> In the documentation and code base, you may come across type annotations and +> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc], +> List[Floats2d]]~~. This so-called generic type describes the layer and its +> input and output type – in this case, it takes a list of `Doc` objects as the +> input and list of 2-dimensional arrays of floats as the output. You can read +> more about defining Thinc models [here](https://thinc.ai/docs/usage-models). +> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for +> how to enable linting in your editor to see live feedback if your inputs and +> outputs don't match. + +A **model architecture** is a function that wires up a Thinc +[`Model`](https://thinc.ai/docs/api-model) instance, which you can then use in a +component or as a layer of a larger network. You can use Thinc as a thin +[wrapper around frameworks](https://thinc.ai/docs/usage-frameworks) such as +PyTorch, TensorFlow or MXNet, or you can implement your logic in Thinc +[directly](https://thinc.ai/docs/usage-models). spaCy's built-in components will never construct their `Model` instances themselves, so you won't have to subclass the component to change its model -architecture. You can just update the config so that it refers -to a different registered function. 
Once the component has been created, its -model instance has already been assigned, so you cannot change its model -architecture. The architecture is like a recipe for the network, and you can't -change the recipe once the dish has already been prepared. You have to make -a new one. +architecture. You can just **update the config** so that it refers to a +different registered function. Once the component has been created, its `Model` +instance has already been assigned, so you cannot change its model architecture. +The architecture is like a recipe for the network, and you can't change the +recipe once the dish has already been prepared. You have to make a new one. +spaCy includes a variety of built-in [architectures](/api/architectures) for +different tasks. For example: + + + +| Architecture | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [HashEmbedCNN](/api/architectures#HashEmbedCNN) | Build spaCy’s “standard” embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. ~~Model[List[Doc], List[Floats2d]]~~ | ### Metrics, training output and weighted scores {#metrics} @@ -442,34 +463,15 @@ components are weighted equally. - - | Name | Description | | -------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. | -| **Precision** (P) | Should increase. | -| **Recall** (R) | Should increase. | -| **F-Score** (F) | The weighted average of precision and recall. Should increase. | +| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. 
| +| **Recall** (R) | Percentage of reference annotations recovered. Should increase. | +| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. | | **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. | | **Words per second** (WPS) | Prediction speed in words per second. Should stay stable. | -Precision and recall are two common measurements of a model's accuracy. You -need precision and recall statistics whenever your model can return a variable -number of predictions, as in this situation there are two different ways your -model can be "accurate". - -Precision refers to the percentage of predicted annotations that were correct, -while recall refers to the percentage of reference annotations recovered. -A model that only returns one entity for a document will have precision 1.0 if -that entity is correct, but might have low recall if it has missed lots of -other correct entities. F-score is the harmonic mean of precision and recall. -The harmonic mean is used instead of the arithmetic mean so that systems with -very low precision or very low recall will score lower than systems that -achieve a balance of the two. - - - - Note that if the development data has raw text, some of the gold-standard entities might not align to the predicted tokenization. These tokenization errors are **excluded from the NER evaluation**. 
If your tokenization makes it diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md deleted file mode 100644 index 79ac8177f..000000000 --- a/website/docs/usage/transformers.md +++ /dev/null @@ -1,305 +0,0 @@ ---- -title: Transformers -teaser: Using transformer models like BERT in spaCy -menu: - - ['Installation', 'install'] - - ['Runtime Usage', 'runtime'] - - ['Training Usage', 'training'] -next: /usage/training ---- - -## Installation {#install hidden="true"} - -Transformers are a family of neural network architectures that compute **dense, -context-sensitive representations** for the tokens in your documents. Downstream -models in your pipeline can then use these representations as input features to -**improve their predictions**. You can connect multiple components to a single -transformer model, with any or all of those components giving feedback to the -transformer to fine-tune it to your tasks. spaCy's transformer support -interoperates with [PyTorch](https://pytorch.org) and the -[HuggingFace `transformers`](https://huggingface.co/transformers/) library, -giving you access to thousands of pretrained models for your pipelines. There -are many [great guides](http://jalammar.github.io/illustrated-transformer/) to -transformer models, but for practical purposes, you can simply think of them as -a drop-in replacement that let you achieve **higher accuracy** in exchange for -**higher training and runtime costs**. - -### System requirements - -We recommend an NVIDIA GPU with at least 10GB of memory in order to work with -transformer models. The exact requirements will depend on the transformer you -model you choose and whether you're training the pipeline or simply running it. -Training a transformer-based model without a GPU will be too slow for most -practical purposes. You'll also need to make sure your GPU drivers are -up-to-date and v9+ of the CUDA runtime is installed. 
- -Once you have CUDA installed, you'll need to install two pip packages, -[`cupy`](https://docs.cupy.dev/en/stable/install.html) and -[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy` -is just like `numpy`, but for GPU. The best way to install it is to choose a -wheel that matches the version of CUDA you're using. You may also need to set -the `CUDA_PATH` environment variable if your CUDA runtime is installed in a -non-standard location. Putting it all together, if you had installed CUDA 10.2 -in `/opt/nvidia/cuda`, you would run: - -```bash -### Installation with CUDA -export CUDA_PATH="/opt/nvidia/cuda" -pip install cupy-cuda102 -pip install spacy-transformers -``` - -Provisioning a new machine will require about 5GB of data to be downloaded in -total: 3GB for the CUDA runtime, 800MB for PyTorch, 400MB for CuPy, 500MB for -the transformer weights, and about 200MB for spaCy and its various requirements. - -## Runtime usage {#runtime} - -Transformer models can be used as **drop-in replacements** for other types of -neural networks, so your spaCy pipeline can include them in a way that's -completely invisible to the user. Users will download, load and use the model in -the standard way, like any other spaCy pipeline. Instead of using the -transformers as subnetworks directly, you can also use them via the -[`Transformer`](/api/transformer) pipeline component. - -![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) - -The `Transformer` component sets the -[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, -which lets you access the transformers outputs at runtime. - -```bash -$ python -m spacy download en_core_trf_lg -``` - -```python -### Example -import spacy -from thinc.api import use_pytorch_for_gpu_memory, require_gpu - -# Use the GPU, with memory allocations directed via PyTorch. 
-# This prevents out-of-memory errors that would otherwise occur from competing -# memory pools. -use_pytorch_for_gpu_memory() -require_gpu(0) - -nlp = spacy.load("en_core_trf_lg") -for doc in nlp.pipe(["some text", "some other text"]): - tokvecs = doc._.trf_data.tensors[-1] -``` - -You can also customize how the [`Transformer`](/api/transformer) component sets -annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. -This callback will be called with the raw input and output data for the whole -batch, along with the batch of `Doc` objects, allowing you to implement whatever -you need. The annotation setter is called with a batch of [`Doc`](/api/doc) -objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) -containing the transformers data for the batch. - -```python -def custom_annotation_setter(docs, trf_data): - # TODO: - ... - -nlp = spacy.load("en_core_trf_lg") -nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter -doc = nlp("This is a text") -print() # TODO: -``` - -## Training usage {#training} - -The recommended workflow for training is to use spaCy's -[config system](/usage/training#config), usually via the -[`spacy train`](/api/cli#train) command. The training config defines all -component settings and hyperparameters in one place and lets you describe a tree -of objects by referring to creation functions, including functions you register -yourself. For details on how to get started with training your own model, check -out the [training quickstart](/usage/training#quickstart). - - - -The easiest way to get started is to clone a transformers-based project -template. Swap in your data, edit the settings and hyperparameters and train, -evaluate, package and visualize your model. - - - -The `[components]` section in the [`config.cfg`](/api/data-formats#config) -describes the pipeline components and the settings used to construct them, -including their model implementation. 
Here's a config snippet for the -[`Transformer`](/api/transformer) component, along with matching Python code. In -this case, the `[components.transformer]` block describes the `transformer` -component: - -> #### Python equivalent -> -> ```python -> from spacy_transformers import Transformer, TransformerModel -> from spacy_transformers.annotation_setters import null_annotation_setter -> from spacy_transformers.span_getters import get_doc_spans -> -> trf = Transformer( -> nlp.vocab, -> TransformerModel( -> "bert-base-cased", -> get_spans=get_doc_spans, -> tokenizer_config={"use_fast": True}, -> ), -> annotation_setter=null_annotation_setter, -> max_batch_items=4096, -> ) -> ``` - -```ini -### config.cfg (excerpt) -[components.transformer] -factory = "transformer" -max_batch_items = 4096 - -[components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" -name = "bert-base-cased" -tokenizer_config = {"use_fast": true} - -[components.transformer.model.get_spans] -@span_getters = "doc_spans.v1" - -[components.transformer.annotation_setter] -@annotation_setters = "spacy-transformer.null_annotation_setter.v1" - -``` - -The `[components.transformer.model]` block describes the `model` argument passed -to the transformer component. It's a Thinc -[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the -component. Here, it references the function -[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel) -registered in the [`architectures` registry](/api/top-level#registry). If a key -in a block starts with `@`, it's **resolved to a function** and all other -settings are passed to the function as arguments. In this case, `name`, -`tokenizer_config` and `get_spans`. - -`get_spans` is a function that takes a batch of `Doc` object and returns lists -of potentially overlapping `Span` objects to process by the transformer. 
Several -[built-in functions](/api/transformer#span-getters) are available – for example, -to process the whole document or individual sentences. When the config is -resolved, the function is created and passed into the model as an argument. - - - -Remember that the `config.cfg` used for training should contain **no missing -values** and requires all settings to be defined. You don't want any hidden -defaults creeping in and changing your results! spaCy will tell you if settings -are missing, and you can run -[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in -all defaults. - - - -### Customizing the settings {#training-custom-settings} - -To change any of the settings, you can edit the `config.cfg` and re-run the -training. To change any of the functions, like the span getter, you can replace -the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to -process sentences. You can also register your own functions using the -`span_getters` registry: - -> #### config.cfg -> -> ```ini -> [components.transformer.model.get_spans] -> @span_getters = "custom_sent_spans" -> ``` - -```python -### code.py -import spacy_transformers - -@spacy_transformers.registry.span_getters("custom_sent_spans") -def configure_custom_sent_spans(): - # TODO: write custom example - def get_sent_spans(docs): - return [list(doc.sents) for doc in docs] - - return get_sent_spans -``` - -To resolve the config during training, spaCy needs to know about your custom -function. You can make it available via the `--code` argument that can point to -a Python file. For more details on training with custom code, see the -[training documentation](/usage/training#custom-code). 
- -```bash -$ python -m spacy train ./config.cfg --code ./code.py -``` - -### Customizing the model implementations {#training-custom-model} - -The [`Transformer`](/api/transformer) component expects a Thinc -[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` -argument. You're not limited to the implementation provided by -`spacy-transformers` – the only requirement is that your registered function -must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that -is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a -[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the -transformer data. - -> #### Model type annotations -> -> In the documentation and code base, you may come across type annotations and -> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc], -> List[Floats2d]]~~. This so-called generic type describes the layer and its -> input and output type – in this case, it takes a list of `Doc` objects as the -> input and list of 2-dimensional arrays of floats as the output. You can read -> more about defining Thinc models [here](https://thinc.ai/docs/usage-models). -> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for -> how to enable linting in your editor to see live feedback if your inputs and -> outputs don't match. - -The same idea applies to task models that power the **downstream components**. -Most of spaCy's built-in model creation functions support a `tok2vec` argument, -which should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This -is where we'll plug in our transformer model, using the -[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily -delegates to the `Transformer` pipeline component. 
- -```ini -### config.cfg (excerpt) {highlight="12"} -[components.ner] -factory = "ner" - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 -hidden_width = 128 -maxout_pieces = 3 -use_upper = false - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy-transformers.Tok2VecListener.v1" -grad_factor = 1.0 - -[nlp.pipeline.ner.model.tok2vec.pooling] -@layers = "reduce_mean.v1" -``` - -The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a -[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument -`pooling`, which needs to be of type `Model[Ragged, Floats2d]`. This layer -determines how the vector for each spaCy token will be computed from the zero or -more source rows the token is aligned against. Here we use the -[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which -averages the wordpiece rows. We could instead use -[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom -function you write yourself. - -You can have multiple components all listening to the same transformer model, -and all passing gradients back to it. By default, all of the gradients will be -**equally weighted**. You can control this with the `grad_factor` setting, which -lets you reweight the gradients from the different listeners. For instance, -setting `grad_factor = 0` would disable gradients from one of the listeners, -while `grad_factor = 2.0` would multiply them by 2. This is similar to having a -custom learning rate for each component. Instead of a constant, you can also -provide a schedule, allowing you to freeze the shared parameters at the start of -training. diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index a72baa7fa..ee90b6cc5 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -152,6 +152,7 @@ The following methods, attributes and commands are new in spaCy v3.0. 
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. | | [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | +| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | | [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | @@ -175,6 +176,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**. There can be many [different models](/models) and not just one "English model", so you should always use the full model name like [`en_core_web_sm`](/models/en) explicitly. +- A model's [`meta.json`](/api/data-formats#meta) is now only used to provide + meta information like the model name, author, license and labels. It's **not** + used to construct the processing pipeline anymore. This is all defined in the + [`config.cfg`](/api/data-formats#config), which also includes all settings + used to train the model. - The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now only take a `config.cfg` file containing the full [training config](/usage/training#config). 
diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json index 9bb1abbf4..3cfcf5f75 100644 --- a/website/meta/type-annotations.json +++ b/website/meta/type-annotations.json @@ -32,6 +32,7 @@ "Floats2d": "https://thinc.ai/docs/api-types#types", "Floats3d": "https://thinc.ai/docs/api-types#types", "FloatsXd": "https://thinc.ai/docs/api-types#types", + "Ops": "https://thinc.ai/docs/api-backends#ops", "cymem.Pool": "https://github.com/explosion/cymem", "preshed.BloomFilter": "https://github.com/explosion/preshed", "transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding", diff --git a/website/src/components/icon.js b/website/src/components/icon.js index 322337955..8dfba7426 100644 --- a/website/src/components/icon.js +++ b/website/src/components/icon.js @@ -53,7 +53,15 @@ const icons = { package: PackageIcon, } -export default function Icon({ name, width = 20, height, inline = false, variant, className }) { +export default function Icon({ + name, + width = 20, + height, + inline = false, + variant, + className, + ...props +}) { const IconComponent = icons[name] const iconClassNames = classNames(classes.root, className, { [classes.inline]: inline, @@ -67,6 +75,7 @@ export default function Icon({ name, width = 20, height, inline = false, variant aria-hidden="true" width={width} height={height || width} + {...props} /> ) } diff --git a/website/src/components/table.js b/website/src/components/table.js index 3d442cde7..3f41a587b 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -9,19 +9,25 @@ function isNum(children) { return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children) } -function getCellContent(children) { +function getCellContent(cellChildren) { const icons = { - '✅': { name: 'yes', variant: 'success' }, - '❌': { name: 'no', variant: 'error' }, + '✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' }, 
+ '❌': { name: 'no', variant: 'error', 'aria-label': 'negative' }, } - - if (isString(children) && icons[children.trim()]) { - const iconProps = icons[children.trim()] - return <Icon {...iconProps} /> - } - // Work around prettier auto-escape - if (isString(children) && children.startsWith('\\')) { - return children.slice(1) + let children = isString(cellChildren) ? [cellChildren] : cellChildren + if (Array.isArray(children)) { + return children.map((child, i) => { + if (isString(child)) { + const icon = icons[child.trim()] + if (icon) { + const props = { ...icon, inline: i < children.length, 'aria-hidden': undefined } + return <Icon {...props} key={i} /> + } + // Work around prettier auto-escape + if (child.startsWith('\\')) return child.slice(1) + } + return child + }) } return children } diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js index 4e379e5ec..1a77cc338 100644 --- a/website/src/widgets/quickstart-training.js +++ b/website/src/widgets/quickstart-training.js @@ -38,7 +38,8 @@ const DATA = [ { id: 'optimize', title: 'Optimize for', - help: '...', + help: + 'Optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger & slower model). Will impact the choice of architecture, pretrained weights and hyperparameters.', options: [ { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' }, { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' }, @@ -84,10 +85,12 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg' query={query} render={({ site }) => { const langs = site.siteMetadata.languages - DATA[0].dropdown = langs.map(({ name, code }) => ({ - id: code, - title: name, - })) + DATA[0].dropdown = langs + .map(({ name, code }) => ({ + id: code, + title: name, + })) + .sort((a, b) => a.id.localeCompare(b.id)) return (