From 728fec0194224ef2d58172511dc087c274b57e4e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Aug 2020 00:49:19 +0200 Subject: [PATCH] Update docs [ci skip] --- netlify.toml | 4 +- website/docs/api/architectures.md | 6 +- website/docs/api/cli.md | 12 +- website/docs/api/top-level.md | 2 +- website/docs/api/transformer.md | 3 +- website/docs/usage/101/_vectors-similarity.md | 14 +- website/docs/usage/embeddings-transformers.md | 459 ++++++++++++++++++ website/docs/usage/linguistic-features.md | 157 +++++- website/docs/usage/processing-pipelines.md | 10 +- website/docs/usage/spacy-101.md | 2 +- website/docs/usage/training.md | 6 +- website/docs/usage/v2.md | 2 +- website/docs/usage/v3.md | 16 +- website/docs/usage/vectors-embeddings.md | 340 ------------- website/meta/sidebars.json | 7 +- website/src/components/typography.js | 10 +- 16 files changed, 665 insertions(+), 385 deletions(-) create mode 100644 website/docs/usage/embeddings-transformers.md delete mode 100644 website/docs/usage/vectors-embeddings.md diff --git a/netlify.toml b/netlify.toml index 6afa5ed7e..2f3e350e6 100644 --- a/netlify.toml +++ b/netlify.toml @@ -36,11 +36,11 @@ redirects = [ {from = "/docs/api/features", to = "/models/#architecture", force = true}, {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true}, {from = "/docs/usage/showcase", to = "/universe", force = true}, - {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true}, + {from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true}, {from = "/tutorials", to = "/usage/examples", force = true}, # Old documentation pages (v2.x) {from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true}, - {from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true}, + {from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true}, {from = "/api/goldparse", to = "/api/top-level", force = true}, {from = "/api/goldcorpus", to = "/api/corpus", force = true}, {from = "/api/annotation", to = "/api/data-formats", force = true}, diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 3bc2ab578..8bb5cdeea 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -243,11 +243,15 @@ Encode context using bidirectional LSTM layers. Requires | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +### spacy.StaticVectors.v1 {#StaticVectors} + + + ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} The following architectures are provided by the package [`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the -[usage documentation](/usage/transformers) for how to integrate the +[usage documentation](/usage/embeddings-transformers) for how to integrate the architectures into your training config. 
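For example, a training config might reference one of these architectures like any other registered function – a minimal excerpt, assuming a pipeline with a `transformer` component (see the usage docs for the full setup):

```ini
### config.cfg (excerpt)
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
```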
### spacy-transformers.TransformerModel.v1 {#TransformerModel} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ec61eb0b5..b614898df 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -162,14 +162,12 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] ### init model {#init-model new="2" tag="command"} - - Create a new model directory from raw data, like word frequencies, Brown -clusters and word vectors. This command is similar to the `spacy model` command -in v1.x. Note that in order to populate the model's vocab, you need to pass in a -JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as -`--jsonl-loc` with optional `id` values that correspond to the vectors table. -Just loading in vectors will not automatically populate the vocab. +clusters and word vectors. Note that in order to populate the model's vocab, you +need to pass in a JSONL-formatted +[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional +`id` values that correspond to the vectors table. Just loading in vectors will +not automatically populate the vocab. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 3bce3db93..0f87b8fd0 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -316,7 +316,7 @@ factories. The following registries are added by the [`spacy-transformers`](https://github.com/explosion/spacy-transformers) package. See the [`Transformer`](/api/transformer) API reference and -[usage docs](/usage/transformers) for details. +[usage docs](/usage/embeddings-transformers) for details. > #### Example > diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 19cb4daa2..d4d7de161 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -41,7 +41,8 @@ token, the spaCy token receives the sum of their values. To access the values, you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The package also adds the function registries [`@span_getters`](#span_getters) and [`@annotation_setters`](#annotation_setters) with several built-in registered -functions. For more details, see the [usage documentation](/usage/transformers). +functions. For more details, see the +[usage documentation](/usage/embeddings-transformers). ## Config and implementation {#config} diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 9ff55f815..a04c96236 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -77,12 +77,14 @@ or flagging duplicates. For example, you can suggest a user content that's similar to what they're currently looking at, or label a support ticket as a duplicate if it's very similar to an already existing one. -Each `Doc`, `Span` and `Token` comes with a -[`.similarity()`](/api/token#similarity) method that lets you compare it with -another object, and determine the similarity. Of course similarity is always -subjective – whether "dog" and "cat" are similar really depends on how you're -looking at it. spaCy's similarity model usually assumes a pretty general-purpose -definition of similarity. +Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and +[`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity) +method that lets you compare it with another object, and determine the +similarity. 
Of course similarity is always subjective – whether "dog" and "cat" +are similar really depends on how you're looking at it. spaCy's similarity model +usually assumes a pretty general-purpose definition of similarity. + + ```python ### {executable="true"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md new file mode 100644 index 000000000..23037a3ab --- /dev/null +++ b/website/docs/usage/embeddings-transformers.md @@ -0,0 +1,459 @@ +--- +title: Embeddings, Transformers and Transfer Learning +teaser: Using transformer embeddings like BERT in spaCy +menu: + - ['Embedding Layers', 'embedding-layers'] + - ['Transformers', 'transformers'] + - ['Static Vectors', 'static-vectors'] + - ['Pretraining', 'pretraining'] +next: /usage/training +--- + + + +## Shared embedding layers {#embedding-layers} + + + + + +The key difference between [word vectors](#word-vectors) and contextual language +models such as [transformers](#transformers) is that word vectors model +**lexical types**, rather than _tokens_. If you have a list of terms with no +context around them, a transformer model like BERT can't really help you. BERT +is designed to understand language **in context**, which isn't what you have. A +word vectors table will be a much better fit for your task. However, if you do +have words in context — whole sentences or paragraphs of running text — word +vectors will only provide a very rough approximation of what the text is about. + +Word vectors are also very computationally efficient, as they map a word to a +vector with a single indexing operation. Word vectors are therefore useful as a +way to **improve the accuracy** of neural network models, especially models that +are small or have received little or no pretraining. In spaCy, word vector +tables are only used as **static features**. spaCy does not backpropagate +gradients to the pretrained word vectors table. The static vectors table is +usually used in combination with a smaller table of learned task-specific +embeddings. + + + + + +Word vectors are not compatible with most [transformer models](#transformers), +but if you're training another type of NLP network, it's almost always worth +adding word vectors to your model. As well as improving your final accuracy, +word vectors often make experiments more consistent, as the accuracy you reach +will be less sensitive to how the network is randomly initialized. High variance +due to random chance can slow down your progress significantly, as you need to +run many experiments to filter the signal from the noise. + +Word vector features need to be enabled prior to training, and the same word +vectors table will need to be available at runtime as well. You cannot add word +vector features once the model has already been trained, and you usually cannot +replace one word vectors table with another without causing a significant loss +of performance. + + + +## Using transformer models {#transformers} + +Transformers are a family of neural network architectures that compute **dense, +context-sensitive representations** for the tokens in your documents. Downstream +models in your pipeline can then use these representations as input features to +**improve their predictions**. You can connect multiple components to a single +transformer model, with any or all of those components giving feedback to the +transformer to fine-tune it to your tasks. 
spaCy's transformer support +interoperates with [PyTorch](https://pytorch.org) and the +[HuggingFace `transformers`](https://huggingface.co/transformers/) library, +giving you access to thousands of pretrained models for your pipelines. There +are many [great guides](http://jalammar.github.io/illustrated-transformer/) to +transformer models, but for practical purposes, you can simply think of them as +a drop-in replacement that let you achieve **higher accuracy** in exchange for +**higher training and runtime costs**. + +### Setup and installation {#transformers-installation} + +> #### System requirements +> +> We recommend an NVIDIA **GPU** with at least **10GB of memory** in order to +> work with transformer models. Make sure your GPU drivers are up to date and +> you have **CUDA v9+** installed. + +> The exact requirements will depend on the transformer model. Training a +> transformer-based model without a GPU will be too slow for most practical +> purposes. +> +> Provisioning a new machine will require about **5GB** of data to be +> downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB +> spaCy and dependencies. + +Once you have CUDA installed, you'll need to install two pip packages, +[`cupy`](https://docs.cupy.dev/en/stable/install.html) and +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy` +is just like `numpy`, but for GPU. The best way to install it is to choose a +wheel that matches the version of CUDA you're using. You may also need to set +the `CUDA_PATH` environment variable if your CUDA runtime is installed in a +non-standard location. Putting it all together, if you had installed CUDA 10.2 +in `/opt/nvidia/cuda`, you would run: + +```bash +### Installation with CUDA +export CUDA_PATH="/opt/nvidia/cuda" +pip install cupy-cuda102 +pip install spacy-transformers +``` + +### Runtime usage {#transformers-runtime} + +Transformer models can be used as **drop-in replacements** for other types of +neural networks, so your spaCy pipeline can include them in a way that's +completely invisible to the user. Users will download, load and use the model in +the standard way, like any other spaCy pipeline. Instead of using the +transformers as subnetworks directly, you can also use them via the +[`Transformer`](/api/transformer) pipeline component. + +![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) + +The `Transformer` component sets the +[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, +which lets you access the transformers outputs at runtime. + +```bash +$ python -m spacy download en_core_trf_lg +``` + +```python +### Example +import spacy +from thinc.api import use_pytorch_for_gpu_memory, require_gpu + +# Use the GPU, with memory allocations directed via PyTorch. +# This prevents out-of-memory errors that would otherwise occur from competing +# memory pools. +use_pytorch_for_gpu_memory() +require_gpu(0) + +nlp = spacy.load("en_core_trf_lg") +for doc in nlp.pipe(["some text", "some other text"]): + tokvecs = doc._.trf_data.tensors[-1] +``` + +You can also customize how the [`Transformer`](/api/transformer) component sets +annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. +This callback will be called with the raw input and output data for the whole +batch, along with the batch of `Doc` objects, allowing you to implement whatever +you need. 
The annotation setter is called with a batch of [`Doc`](/api/doc) +objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) +containing the transformers data for the batch. + +```python +def custom_annotation_setter(docs, trf_data): + # TODO: + ... + +nlp = spacy.load("en_core_trf_lg") +nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter +doc = nlp("This is a text") +print() # TODO: +``` + +### Training usage {#transformers-training} + +The recommended workflow for training is to use spaCy's +[config system](/usage/training#config), usually via the +[`spacy train`](/api/cli#train) command. The training config defines all +component settings and hyperparameters in one place and lets you describe a tree +of objects by referring to creation functions, including functions you register +yourself. For details on how to get started with training your own model, check +out the [training quickstart](/usage/training#quickstart). + + + +The easiest way to get started is to clone a transformers-based project +template. Swap in your data, edit the settings and hyperparameters and train, +evaluate, package and visualize your model. + + + +The `[components]` section in the [`config.cfg`](/api/data-formats#config) +describes the pipeline components and the settings used to construct them, +including their model implementation. Here's a config snippet for the +[`Transformer`](/api/transformer) component, along with matching Python code. In +this case, the `[components.transformer]` block describes the `transformer` +component: + +> #### Python equivalent +> +> ```python +> from spacy_transformers import Transformer, TransformerModel +> from spacy_transformers.annotation_setters import null_annotation_setter +> from spacy_transformers.span_getters import get_doc_spans +> +> trf = Transformer( +> nlp.vocab, +> TransformerModel( +> "bert-base-cased", +> get_spans=get_doc_spans, +> tokenizer_config={"use_fast": True}, +> ), +> annotation_setter=null_annotation_setter, +> max_batch_items=4096, +> ) +> ``` + +```ini +### config.cfg (excerpt) +[components.transformer] +factory = "transformer" +max_batch_items = 4096 + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "bert-base-cased" +tokenizer_config = {"use_fast": true} + +[components.transformer.model.get_spans] +@span_getters = "doc_spans.v1" + +[components.transformer.annotation_setter] +@annotation_setters = "spacy-transformer.null_annotation_setter.v1" + +``` + +The `[components.transformer.model]` block describes the `model` argument passed +to the transformer component. It's a Thinc +[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the +component. Here, it references the function +[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel) +registered in the [`architectures` registry](/api/top-level#registry). If a key +in a block starts with `@`, it's **resolved to a function** and all other +settings are passed to the function as arguments. In this case, `name`, +`tokenizer_config` and `get_spans`. + +`get_spans` is a function that takes a batch of `Doc` object and returns lists +of potentially overlapping `Span` objects to process by the transformer. Several +[built-in functions](/api/transformer#span-getters) are available – for example, +to process the whole document or individual sentences. When the config is +resolved, the function is created and passed into the model as an argument. 
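For illustration, here's roughly what a whole-document span getter looks like – a simplified sketch of what the built-in `doc_spans.v1` does, not the exact implementation:

```python
### Simplified span getter
from typing import List
from spacy.tokens import Doc, Span

def get_doc_spans(docs: List[Doc]) -> List[List[Span]]:
    # One span per Doc, covering the entire document
    return [[doc[:]] for doc in docs]
```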
+ + + +Remember that the `config.cfg` used for training should contain **no missing +values** and requires all settings to be defined. You don't want any hidden +defaults creeping in and changing your results! spaCy will tell you if settings +are missing, and you can run +[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in +all defaults. + + + +### Customizing the settings {#transformers-training-custom-settings} + +To change any of the settings, you can edit the `config.cfg` and re-run the +training. To change any of the functions, like the span getter, you can replace +the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to +process sentences. You can also register your own functions using the +`span_getters` registry: + +> #### config.cfg +> +> ```ini +> [components.transformer.model.get_spans] +> @span_getters = "custom_sent_spans" +> ``` + +```python +### code.py +import spacy_transformers + +@spacy_transformers.registry.span_getters("custom_sent_spans") +def configure_custom_sent_spans(): + # TODO: write custom example + def get_sent_spans(docs): + return [list(doc.sents) for doc in docs] + + return get_sent_spans +``` + +To resolve the config during training, spaCy needs to know about your custom +function. You can make it available via the `--code` argument that can point to +a Python file. For more details on training with custom code, see the +[training documentation](/usage/training#custom-code). + +```bash +$ python -m spacy train ./config.cfg --code ./code.py +``` + +### Customizing the model implementations {#training-custom-model} + +The [`Transformer`](/api/transformer) component expects a Thinc +[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` +argument. You're not limited to the implementation provided by +`spacy-transformers` – the only requirement is that your registered function +must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that +is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the +transformer data. + +> #### Model type annotations +> +> In the documentation and code base, you may come across type annotations and +> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc], +> List[Floats2d]]~~. This so-called generic type describes the layer and its +> input and output type – in this case, it takes a list of `Doc` objects as the +> input and list of 2-dimensional arrays of floats as the output. You can read +> more about defining Thinc models [here](https://thinc.ai/docs/usage-models). +> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for +> how to enable linting in your editor to see live feedback if your inputs and +> outputs don't match. + +The same idea applies to task models that power the **downstream components**. +Most of spaCy's built-in model creation functions support a `tok2vec` argument, +which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This +is where we'll plug in our transformer model, using the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily +delegates to the `Transformer` pipeline component. 
+ +```ini +### config.cfg (excerpt) {highlight="12"} +[components.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy-transformers.Tok2VecListener.v1" +grad_factor = 1.0 + +[nlp.pipeline.ner.model.tok2vec.pooling] +@layers = "reduce_mean.v1" +``` + +The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a +[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument +`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. This layer +determines how the vector for each spaCy token will be computed from the zero or +more source rows the token is aligned against. Here we use the +[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which +averages the wordpiece rows. We could instead use +[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom +function you write yourself. + +You can have multiple components all listening to the same transformer model, +and all passing gradients back to it. By default, all of the gradients will be +**equally weighted**. You can control this with the `grad_factor` setting, which +lets you reweight the gradients from the different listeners. For instance, +setting `grad_factor = 0` would disable gradients from one of the listeners, +while `grad_factor = 2.0` would multiply them by 2. This is similar to having a +custom learning rate for each component. Instead of a constant, you can also +provide a schedule, allowing you to freeze the shared parameters at the start of +training. + +## Static vectors {#static-vectors} + + + +### Using word vectors in your models {#word-vectors-models} + +Many neural network models are able to use word vector tables as additional +features, which sometimes results in significant improvements in accuracy. +spaCy's built-in embedding layer, +[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use +word vector tables using the `also_use_static_vectors` flag. This setting is +also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) +layer, which builds the default token-to-vector encoding architecture. + +```ini +[tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 128 +rows = 7000 +also_embed_subwords = true +also_use_static_vectors = true +``` + + + +The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in +the `architectures` [registry](/api/top-level#registry), and call the returned +object with the rest of the arguments from the block. This will result in a call +to the +[`MultiHashEmbed`](https://github.com/explosion/spacy/tree/develop/spacy/ml/models/tok2vec.py) +function, which will return a [Thinc](https://thinc.ai) model object with the +type signature ~~Model[List[Doc], List[Floats2d]]~~. Because the embedding layer +takes a list of `Doc` objects as input, it does not need to store a copy of the +vectors table. The vectors will be retrieved from the `Doc` objects that are +passed in, via the `doc.vocab.vectors` attribute. This part of the process is +handled by the [StaticVectors](/api/architectures#StaticVectors) layer. 
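Conceptually, that lookup amounts to indexing into the shared table through each `Doc`'s vocab. The following is only a rough sketch of the idea – it ignores the learned projection and the batching that the real [StaticVectors](/api/architectures#StaticVectors) layer performs:

```python
### Conceptual sketch of the static lookup
from typing import List
import numpy
from spacy.tokens import Doc

def get_static_rows(docs: List[Doc]) -> List[numpy.ndarray]:
    outputs = []
    for doc in docs:
        vectors = doc.vocab.vectors  # shared table, not copied into the model
        # Missing keys fall back to row 0 here for simplicity
        rows = [vectors.key2row.get(token.orth, 0) for token in doc]
        outputs.append(vectors.data[rows])
    return outputs
```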
+ + + +#### Creating a custom embedding layer {#custom-embedding-layer} + +The [MultiHashEmbed](/api/architectures#StaticVectors) layer is spaCy's +recommended strategy for constructing initial word representations for your +neural network models, but you can also implement your own. You can register any +function to a string name, and then reference that function within your config +(see the [training docs](/usage/training) for more details). To try this out, +you can save the following little example to a new Python file: + +```python +from spacy.ml.staticvectors import StaticVectors +from spacy.util import registry + +print("I was imported!") + +@registry.architectures("my_example.MyEmbedding.v1") +def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]: + print("I was called!") + return StaticVectors(nO=output_width) +``` + +If you pass the path to your file to the [`spacy train`](/api/cli#train) command +using the `--code` argument, your file will be imported, which means the +decorator registering the function will be run. Your function is now on equal +footing with any of spaCy's built-ins, so you can drop it in instead of any +other model with the same input and output signature. For instance, you could +use it in the tagger model as follows: + +```ini +[tagger.model.tok2vec.embed] +@architectures = "my_example.MyEmbedding.v1" +output_width = 128 +``` + +Now that you have a custom function wired into the network, you can start +implementing the logic you're interested in. For example, let's say you want to +try a relatively simple embedding strategy that makes use of static word +vectors, but combines them via summation with a smaller table of learned +embeddings. + +```python +from thinc.api import add, chain, remap_ids, Embed +from spacy.ml.staticvectors import StaticVectors + +@registry.architectures("my_example.MyEmbedding.v1") +def MyCustomVectors( + output_width: int, + vector_width: int, + embed_rows: int, + key2row: Dict[int, int] +) -> Model[List[Doc], List[Floats2d]]: + return add( + StaticVectors(nO=output_width), + chain( + FeatureExtractor(["ORTH"]), + remap_ids(key2row), + Embed(nO=output_width, nV=embed_rows) + ) + ) +``` + +## Pretraining {#pretraining} + + diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index ac922c4fa..325063e58 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -9,6 +9,7 @@ menu: - ['Tokenization', 'tokenization'] - ['Merging & Splitting', 'retokenization'] - ['Sentence Segmentation', 'sbd'] + - ['Vectors & Similarity', 'vectors-similarity'] - ['Language data', 'language-data'] --- @@ -1024,10 +1025,10 @@ produced by the tokenizer. > > If you're working with transformer models like BERT, check out the > [`spacy-transformers`](https://github.com/explosion/spacy-transformers) -> extension package and [documentation](/usage/transformers). It includes a -> pipeline component for using pretrained transformer weights and **training -> transformer models** in spaCy, as well as helpful utilities for aligning word -> pieces to linguistic tokenization. +> extension package and [documentation](/usage/embeddings-transformers). It +> includes a pipeline component for using pretrained transformer weights and +> **training transformer models** in spaCy, as well as helpful utilities for +> aligning word pieces to linguistic tokenization. 
```python ### Custom BERT word piece tokenizer @@ -1510,7 +1511,7 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). Here's an example of a component that implements a pre-processing rule for -splitting on `'...'` tokens. The component is added before the parser, which is +splitting on `"..."` tokens. The component is added before the parser, which is then used to further segment the text. That's possible, because `is_sent_start` is only set to `True` for some of the tokens – all others still specify `None` for unset sentence boundaries. This approach can be useful if you want to @@ -1540,6 +1541,152 @@ doc = nlp(text) print("After:", [sent.text for sent in doc.sents]) ``` +## Word vectors and semantic similarity {#vectors-similarity} + +import Vectors101 from 'usage/101/\_vectors-similarity.md' + + + + + +Computing similarity scores can be helpful in many situations, but it's also +important to maintain **realistic expectations** about what information it can +provide. Words can be related to each over in many ways, so a single +"similarity" score will always be a **mix of different signals**, and vectors +trained on different data can produce very different results that may not be +useful for your purpose. + +Also note that the similarity of `Doc` or `Span` objects defaults to the +**average** of the token vectors. This means it's insensitive to the order of +the words. Two documents expressing the same meaning with dissimilar wording +will return a lower similarity score than two documents that happen to contain +the same words while expressing different meanings. + + + +### Adding word vectors {#adding-vectors} + +Custom word vectors can be trained using a number of open-source libraries, such +as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), +or Tomas Mikolov's original +[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most +word vector libraries output an easy-to-read text-based format, where each line +consists of the word followed by its vector. For everyday use, we want to +convert the vectors model into a binary format that loads faster and takes up +less space on disk. The easiest way to do this is the +[`init model`](/api/cli#init-model) command-line utility. This will output a +spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to +some nice Latin vectors. You can then pass the directory path to +[`spacy.load`](/api/top-level#spacy.load). + +> #### Usage example +> +> ```python +> nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") +> doc1 = nlp_latin("Caecilius est in horto") +> doc2 = nlp_latin("servus est in atrio") +> doc1.similarity(doc2) +> ``` + +```bash +wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz +python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz +``` + + + +To help you strike a good balance between coverage and memory usage, spaCy's +[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same +row** of the table. If you're using the +[`spacy init model`](/api/cli#init-model) command to create a vocabulary, +pruning the vectors will be taken care of automatically if you set the +`--prune-vectors` flag. You can also do it manually in the following steps: + +1. Start with a **word vectors model** that covers a huge vocabulary. 
For + instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) + model provides 300-dimensional GloVe vectors for over 1 million terms of + English. +2. If your vocabulary has values set for the `Lexeme.prob` attribute, the + lexemes will be sorted by descending probability to determine which vectors + to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. +3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of + vectors you want to keep. + +```python +nlp = spacy.load('en_vectors_web_lg') +n_vectors = 105000 # number of vectors to keep +removed_words = nlp.vocab.prune_vectors(n_vectors) + +assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned +assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries +``` + +[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector +table to a given number of unique entries, and returns a dictionary containing +the removed words, mapped to `(string, score)` tuples, where `string` is the +entry the removed word was mapped to, and `score` the similarity score between +the two words. + +```python +### Removed words +{ + "Shore": ("coast", 0.732257), + "Precautionary": ("caution", 0.490973), + "hopelessness": ("sadness", 0.742366), + "Continous": ("continuous", 0.732549), + "Disemboweled": ("corpse", 0.499432), + "biostatistician": ("scientist", 0.339724), + "somewheres": ("somewheres", 0.402736), + "observing": ("observe", 0.823096), + "Leaving": ("leaving", 1.0), +} +``` + +In the example above, the vector for "Shore" was removed and remapped to the +vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to +the vector of "leaving", which is identical. If you're using the +[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors` +option to easily reduce the size of the vectors as you add them to a spaCy +model: + +```bash +$ python -m spacy init model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 +``` + +This will create a spaCy model with vectors for the first 10,000 words in the +vectors model. All other words in the vectors model are mapped to the closest +vector among those retained. + + + +### Adding vectors individually {#adding-individual-vectors} + +The `vector` attribute is a **read-only** numpy or cupy array (depending on +whether you've configured spaCy to use GPU memory), with dtype `float32`. The +array is read-only so that spaCy can avoid unnecessary copy operations where +possible. You can modify the vectors via the [`Vocab`](/api/vocab) or +[`Vectors`](/api/vectors) table. Using the +[`Vocab.set_vector`](/api/vocab#set_vector) method is often the easiest approach +if you have vectors in an arbitrary format, as you can read in the vectors with +your own logic, and just set them with a simple loop. This method is likely to +be slower than approaches that work with the whole vectors table at once, but +it's a great approach for once-off conversions before you save out your model to +disk. 
+ +```python +### Adding vectors +from spacy.vocab import Vocab + +vector_data = { + "dog": numpy.random.uniform(-1, 1, (300,)), + "cat": numpy.random.uniform(-1, 1, (300,)), + "orange": numpy.random.uniform(-1, 1, (300,)) +} +vocab = Vocab() +for word, vector in vector_data.items(): + vocab.set_vector(word, vector) +``` + ## Language data {#language-data} import LanguageData101 from 'usage/101/\_language-data.md' diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 8df4b200d..2b040a832 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1,6 +1,6 @@ --- title: Language Processing Pipelines -next: /usage/vectors-embeddings +next: /usage/embeddings-transformers menu: - ['Processing Text', 'processing'] - ['How Pipelines Work', 'pipelines'] @@ -324,9 +324,9 @@ pretrained components and new components trained on your data. When reusing components across models, keep in mind that the **vocabulary**, **vectors** and model settings **must match**. If a pretrained model includes -[word vectors](/usage/vectors-embeddings) and the component uses them as -features, the model you copy it to needs to have the _same_ vectors available – -otherwise, it won't be able to make the same predictions. +[word vectors](/usage/linguistic-features#vectors-similarity) and the component +uses them as features, the model you copy it to needs to have the _same_ vectors +available – otherwise, it won't be able to make the same predictions. @@ -1202,7 +1202,7 @@ document similarity method. Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token` objects by adding a component to the pipeline. For instance, to customize the [`Doc.similarity`](/api/doc#similarity) method, you can add a component that -sets a custom function to `doc.user_hooks['similarity']`. The built-in +sets a custom function to `doc.user_hooks["similarity"]`. The built-in `Doc.similarity` method will check the `user_hooks` dict, and delegate to your function if you've set one. Similar results can be achieved by setting functions to `Doc.user_span_hooks` and `Doc.user_token_hooks`. diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 49cdd96ea..df08e0320 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -247,7 +247,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md' To learn more about word vectors, how to **customize them** and how to load **your own vectors** into spaCy, see the usage guide on -[using word vectors and semantic similarities](/usage/vectors-embeddings). +[using word vectors and semantic similarities](/usage/linguistic-features#vectors-similarity). diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index fc1624ec1..9f74cafac 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -30,7 +30,7 @@ ready-to-use spaCy models. -## Quickstart {#quickstart} +## Quickstart {#quickstart tag="new"} The recommended way to train your spaCy models is via the [`spacy train`](/api/cli#train) command on the command line. It only needs a @@ -131,7 +131,7 @@ Some of the main advantages and features of spaCy's training config are: multiple components, define them once and reference them as [variables](#config-interpolation). - **Reproducibility with no hidden defaults.** The config file is the "single - source of truth" and includes all settings. + source of truth" and includes all settings. 
- **Automated checks and validation.** When you load a config, spaCy checks if the settings are complete and if all values have the correct types. This lets you catch potential mistakes early. In your custom architectures, you can use @@ -667,7 +667,7 @@ visualize your model. For more details on how to integrate transformer models into your training config and customize the implementations, see the usage guide on -[training transformers](/usage/transformers#training). +[training transformers](/usage/embeddings-transformers#transformers-training). ### Pretraining with spaCy {#pretraining} diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 59a842968..f7bcc17d3 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -218,7 +218,7 @@ available via `token.orth`. The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors assigned to strings, and lets you assign vectors individually, or -[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a +[load in GloVe vectors](/usage/linguistic-features#adding-vectors) from a directory. To help you strike a good balance between coverage and memory usage, the `Vectors` class lets you map **multiple keys** to the **same row** of the table. If you're using the [`spacy init-model`](/api/cli#init-model) command to diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 7213adf4a..fda5393a4 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -30,7 +30,7 @@ menu: -- **Usage:** [Transformers](/usage/transformers), +- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), [Training models](/usage/training) - **API:** [`Transformer`](/api/transformer), [`TransformerData`](/api/transformer#transformerdata), @@ -59,13 +59,13 @@ menu: ### New built-in pipeline components {#features-pipeline-components} -| Name | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | -| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | -| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | -| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | -| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | +| Name | Description | +| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | +| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | +| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | +| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. 
| +| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md deleted file mode 100644 index 184436d12..000000000 --- a/website/docs/usage/vectors-embeddings.md +++ /dev/null @@ -1,340 +0,0 @@ ---- -title: Vectors and Embeddings -menu: - - ["What's a Word Vector?", 'whats-a-vector'] - - ['Using Word Vectors', 'usage'] - - ['Converting and Importing', 'converting'] -next: /usage/transformers ---- - -Word vector tables (or "embeddings") let you find similar terms, and can improve -the accuracy of some of your components. You can even use word vectors as a -quick-and-dirty text-classification solution when you don't have any training data. -Word vector tables are included in some of the spaCy [model packages](/models) -we distribute, and you can easily create your own model packages with word -vectors you train or download yourself. - -## What's a word vector? {#whats-a-vector} - -For spaCy's purposes, a "word vector" is a 1-dimensional slice from a -2-dimensional **vectors table**, with a deterministic mapping from word types to -rows in the table. - -```python -def what_is_a_word_vector( - word_id: int, - key2row: Dict[int, int], - vectors_table: Floats2d, - *, - default_row: int=0 -) -> Floats1d: - return vectors_table[key2row.get(word_id, default_row)] -``` - -An old idea in linguistics is that you can "know a word by the company it -keeps": that is, word meanings can be understood relationally, based on their -patterns of usage. This idea inspired a branch of NLP research known as -"distributional semantics" that has aimed to compute databases of lexical -knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec) -family of algorithms are a key milestone in this line of research. For -simplicity, we will refer to a distributional word representation as a "word -vector", and algorithms that computes word vectors (such as -[GloVe](https://nlp.stanford.edu/projects/glove/), -[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms". - -Word2vec algorithms try to produce vectors tables that let you estimate useful -relationships between words using simple linear algebra operations. For -instance, you can often find close synonyms of a word by finding the vectors -closest to it by cosine distance, and then finding the words that are mapped to -those neighboring vectors. Word vectors can also be useful as features in -statistical models. - -### Word vectors vs. contextual language models {#vectors-vs-language-models} - -The key difference between word vectors and contextual language models such -as [transformers](/usage/transformers) -is that word vectors model **lexical types**, rather than -_tokens_. If you have a list of terms with no context around them, -a transformer model like BERT can't really help you. BERT is designed to understand -language **in context**, which isn't what you have. A word vectors table will be -a much better fit for your task. However, if you do have words in context — whole -sentences or paragraphs of running text — word vectors will only provide a very -rough approximation of what the text is about. - -Word vectors are also very computationally efficient, as they map a word to a -vector with a single indexing operation. 
Word vectors are therefore useful as a -way to **improve the accuracy** of neural network models, especially models that -are small or have received little or no pretraining. In spaCy, word vector -tables are only used as **static features**. spaCy does not backpropagate -gradients to the pretrained word vectors table. The static vectors table is -usually used in combination with a smaller table of learned task-specific -embeddings. - -## Using word vectors {#usage} - -spaCy stores word vector information in the -[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole -vectors table from most spaCy objects. You can also access the vector for a -[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or -[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or -`Span` has multiple tokens, the average of the word vectors will be returned, -excluding any "out of vocabulary" entries that have no vector available. If none -of the words have a vector, a zeroed vector will be returned. - -The `vector` attribute is a **read-only** numpy or cupy array (depending on -whether you've configured spaCy to use GPU memory), with dtype `float32`. The -array is read-only so that spaCy can avoid unnecessary copy operations where -possible. You can modify the vectors via the `Vocab` or `Vectors` table. - -### Word vectors and similarity - -A common use-case of word vectors is to answer _similarity questions_. You can -ask how similar a `token`, `span`, `doc` or `lexeme` is to another object using -the `.similarity()` method. You can even check the similarity of mismatched -types, asking how similar a whole document is to a particular word, how similar -a span is to a document, etc. By default, the `.similarity()` method will use -return the cosine of the `.vector` attribute of the two objects being compared. -You can customize this behavior by setting one or more -[user hooks](/usage/processing-pipelines#custom-components-user-hooks) for the -types you want to customize. - -Word vector similarity is a practical technique for many situations, especially -since it's easy to use and relatively efficient to compute. However, it's -important to maintain realistic expectations about what information it can -provide. Words can be related to each over in many ways, so a single -"similarity" score will always be a mix of different signals. The word vectors -model is also not trained for your specific use-case, so you have no way of -telling it which results are more or less useful for your purpose. These -problems are even more accute when you go from measuring the similarity of -single words to the similarity of spans or documents. The vector averaging -process is insensitive to the order of the words, so `doc1.similarity(doc2)` -will mostly be based on the overlap in lexical items between the two documents -objects. Two documents expressing the same meaning with dissimilar wording will -return a lower similarity score than two documents that happen to contain the -same words while expressing different meanings. - -### Using word vectors in your models - -Many neural network models are able to use word vector tables as additional -features, which sometimes results in significant improvements in accuracy. -spaCy's built-in embedding layer, `spacy.MultiHashEmbed.v1`, can be configured -to use word vector tables using the `also_use_static_vectors` flag. 
This -setting is also available on the `spacy.MultiHashEmbedCNN.v1` layer, which -builds the default token-to-vector encoding architecture. - -``` -[tagger.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true -``` - - -The configuration system will look up the string `spacy.MultiHashEmbed.v1` -in the `architectures` registry, and call the returned object with the -rest of the arguments from the block. This will result in a call to the -`spacy.ml.models.tok2vec.MultiHashEmbed` function, which will return -a Thinc model object with the type signature `Model[List[Doc], -List[Floats2d]]`. Because the embedding layer takes a list of `Doc` objects as -input, it does not need to store a copy of the vectors table. The vectors will -be retrieved from the `Doc` objects that are passed in, via the -`doc.vocab.vectors` attribute. This part of the process is handled by the -`spacy.ml.staticvectors.StaticVectors` layer. - - -#### Creating a custom embedding layer - -The `MultiHashEmbed` layer is spaCy's recommended strategy for constructing -initial word representations for your neural network models, but you can also -implement your own. You can register any function to a string name, and then -reference that function within your config (see the [training]("/usage/training") -section for more details). To try this out, you can save the following little -example to a new Python file: - -``` -from spacy.ml.staticvectors import StaticVectors -from spacy.util import registry - -print("I was imported!") - -@registry.architectures("my_example.MyEmbedding.v1") -def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]: - print("I was called!") - return StaticVectors(nO=output_width) -``` - -If you pass the path to your file to the `spacy train` command using the `-c` -argument, your file will be imported, which means the decorator registering the -function will be run. Your function is now on equal footing with any of spaCy's -built-ins, so you can drop it in instead of any other model with the same input -and output signature. For instance, you could use it in the tagger model as -follows: - -``` -[tagger.model.tok2vec.embed] -@architectures = "my_example.MyEmbedding.v1" -output_width = 128 -``` - -Now that you have a custom function wired into the network, you can start -implementing the logic you're interested in. For example, let's say you want to -try a relatively simple embedding strategy that makes use of static word vectors, -but combines them via summation with a smaller table of learned embeddings. - -```python -from thinc.api import add, chain, remap_ids, Embed -from spacy.ml.staticvectors import StaticVectors - -@registry.architectures("my_example.MyEmbedding.v1") -def MyCustomVectors( - output_width: int, - vector_width: int, - embed_rows: int, - key2row: Dict[int, int] -) -> Model[List[Doc], List[Floats2d]]: - return add( - StaticVectors(nO=output_width), - chain( - FeatureExtractor(["ORTH"]), - remap_ids(key2row), - Embed(nO=output_width, nV=embed_rows) - ) - ) -``` - -#### When should you add word vectors to your model? - -Word vectors are not compatible with most [transformer models](/usage/transformers), -but if you're training another type of NLP network, it's almost always worth -adding word vectors to your model. 
As well as improving your final accuracy, -word vectors often make experiments more consistent, as the accuracy you -reach will be less sensitive to how the network is randomly initialized. High -variance due to random chance can slow down your progress significantly, as you -need to run many experiments to filter the signal from the noise. - -Word vector features need to be enabled prior to training, and the same word vectors -table will need to be available at runtime as well. You cannot add word vector -features once the model has already been trained, and you usually cannot -replace one word vectors table with another without causing a significant loss -of performance. - -## Converting word vectors for use in spaCy {#converting} - -Custom word vectors can be trained using a number of open-source libraries, such -as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), -or Tomas Mikolov's original -[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most -word vector libraries output an easy-to-read text-based format, where each line -consists of the word followed by its vector. For everyday use, we want to -convert the vectors model into a binary format that loads faster and takes up -less space on disk. The easiest way to do this is the -[`init-model`](/api/cli#init-model) command-line utility: - -```bash -wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz -python -m spacy init-model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz -``` - -This will output a spaCy model in the directory `/tmp/la_vectors_wiki_lg`, -giving you access to some nice Latin vectors 😉 You can then pass the directory -path to [`spacy.load()`](/api/top-level#spacy.load). - -```python -nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") -doc1 = nlp_latin("Caecilius est in horto") -doc2 = nlp_latin("servus est in atrio") -doc1.similarity(doc2) -``` - -The model directory will have a `/vocab` directory with the strings, lexical -entries and word vectors from the input vectors model. The -[`init-model`](/api/cli#init-model) command supports a number of archive formats -for the word vectors: the vectors can be in plain text (`.txt`), zipped -(`.zip`), or tarred and zipped (`.tgz`). - -### Optimizing vector coverage {#custom-vectors-coverage new="2"} - -To help you strike a good balance between coverage and memory usage, spaCy's -[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same -row** of the table. If you're using the -[`spacy init-model`](/api/cli#init-model) command to create a vocabulary, -pruning the vectors will be taken care of automatically if you set the -`--prune-vectors` flag. You can also do it manually in the following steps: - -1. Start with a **word vectors model** that covers a huge vocabulary. For - instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) - model provides 300-dimensional GloVe vectors for over 1 million terms of - English. -2. If your vocabulary has values set for the `Lexeme.prob` attribute, the - lexemes will be sorted by descending probability to determine which vectors - to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. -3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of - vectors you want to keep. 
- -```python -nlp = spacy.load('en_vectors_web_lg') -n_vectors = 105000 # number of vectors to keep -removed_words = nlp.vocab.prune_vectors(n_vectors) - -assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned -assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries -``` - -[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector -table to a given number of unique entries, and returns a dictionary containing -the removed words, mapped to `(string, score)` tuples, where `string` is the -entry the removed word was mapped to, and `score` the similarity score between -the two words. - -```python -### Removed words -{ - "Shore": ("coast", 0.732257), - "Precautionary": ("caution", 0.490973), - "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), - "Disemboweled": ("corpse", 0.499432), - "biostatistician": ("scientist", 0.339724), - "somewheres": ("somewheres", 0.402736), - "observing": ("observe", 0.823096), - "Leaving": ("leaving", 1.0), -} -``` - -In the example above, the vector for "Shore" was removed and remapped to the -vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to -the vector of "leaving", which is identical. If you're using the -[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors` -option to easily reduce the size of the vectors as you add them to a spaCy -model: - -```bash -$ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 -``` - -This will create a spaCy model with vectors for the first 10,000 words in the -vectors model. All other words in the vectors model are mapped to the closest -vector among those retained. - -### Adding vectors {#adding-vectors} - -You can also add word vectors individually, using the method `vocab.set_vector`. -This is often the easiest approach if you have vectors in an arbitrary format, -as you can read in the vectors with your own logic, and just set them with -a simple loop. This method is likely to be slower than approaches that work -with the whole vectors table at once, but it's a great approach for once-off -conversions before you save out your model to disk. 
- -```python -### Adding vectors -from spacy.vocab import Vocab - -vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)), - "cat": numpy.random.uniform(-1, 1, (300,)), - "orange": numpy.random.uniform(-1, 1, (300,))} -vocab = Vocab() -for word, vector in vector_data.items(): - vocab.set_vector(word, vector) -``` diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 6f8763955..c830619c5 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -18,8 +18,11 @@ { "text": "Linguistic Features", "url": "/usage/linguistic-features" }, { "text": "Rule-based Matching", "url": "/usage/rule-based-matching" }, { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" }, - { "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" }, - { "text": "Transformers", "url": "/usage/transformers", "tag": "new" }, + { + "text": "Embeddings & Transformers", + "url": "/usage/embeddings-transformers", + "tag": "new" + }, { "text": "Training Models", "url": "/usage/training", "tag": "new" }, { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" }, { "text": "Saving & Loading", "url": "/usage/saving-loading" }, diff --git a/website/src/components/typography.js b/website/src/components/typography.js index 41464473f..d37c345b9 100644 --- a/website/src/components/typography.js +++ b/website/src/components/typography.js @@ -9,7 +9,12 @@ import { isString, github, headingTextClassName } from './util' import classes from '../styles/typography.module.sass' export const H1 = ({ Component = 'h1', className, ...props }) => ( - + ) export const H2 = ({ className, ...props }) => ( @@ -90,6 +95,7 @@ const Headline = ({ source, hidden, action, + permalink = true, className, children, }) => { @@ -102,7 +108,7 @@ const Headline = ({ const tags = tag ? tag.split(',').map(t => t.trim()) : [] return ( - {children} + {children} {tags.map((tag, i) => ( {tag}