From 728fec0194224ef2d58172511dc087c274b57e4e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 18 Aug 2020 00:49:19 +0200 Subject: [PATCH] Update docs [ci skip] --- netlify.toml | 4 +- website/docs/api/architectures.md | 6 +- website/docs/api/cli.md | 12 +- website/docs/api/top-level.md | 2 +- website/docs/api/transformer.md | 3 +- website/docs/usage/101/_vectors-similarity.md | 14 +- website/docs/usage/embeddings-transformers.md | 459 ++++++++++++++++++ website/docs/usage/linguistic-features.md | 157 +++++- website/docs/usage/processing-pipelines.md | 10 +- website/docs/usage/spacy-101.md | 2 +- website/docs/usage/training.md | 6 +- website/docs/usage/v2.md | 2 +- website/docs/usage/v3.md | 16 +- website/docs/usage/vectors-embeddings.md | 340 ------------- website/meta/sidebars.json | 7 +- website/src/components/typography.js | 10 +- 16 files changed, 665 insertions(+), 385 deletions(-) create mode 100644 website/docs/usage/embeddings-transformers.md delete mode 100644 website/docs/usage/vectors-embeddings.md diff --git a/netlify.toml b/netlify.toml index 6afa5ed7e..2f3e350e6 100644 --- a/netlify.toml +++ b/netlify.toml @@ -36,11 +36,11 @@ redirects = [ {from = "/docs/api/features", to = "/models/#architecture", force = true}, {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true}, {from = "/docs/usage/showcase", to = "/universe", force = true}, - {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true}, + {from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true}, {from = "/tutorials", to = "/usage/examples", force = true}, # Old documentation pages (v2.x) {from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true}, - {from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true}, + {from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true}, {from = "/api/goldparse", to = "/api/top-level", force = true}, {from = "/api/goldcorpus", to = "/api/corpus", force = true}, {from = "/api/annotation", to = "/api/data-formats", force = true}, diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 3bc2ab578..8bb5cdeea 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -243,11 +243,15 @@ Encode context using bidirectional LSTM layers. Requires | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +### spacy.StaticVectors.v1 {#StaticVectors} + + + ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} The following architectures are provided by the package [`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the -[usage documentation](/usage/transformers) for how to integrate the +[usage documentation](/usage/embeddings-transformers) for how to integrate the architectures into your training config. 
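For example, a training config might reference one of these architectures like any other registered function – a minimal excerpt, assuming a pipeline with a `transformer` component (see the usage docs for the full setup):

```ini
### config.cfg (excerpt)
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
```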
### spacy-transformers.TransformerModel.v1 {#TransformerModel} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ec61eb0b5..b614898df 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -162,14 +162,12 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] ### init model {#init-model new="2" tag="command"} - - Create a new model directory from raw data, like word frequencies, Brown -clusters and word vectors. This command is similar to the `spacy model` command -in v1.x. Note that in order to populate the model's vocab, you need to pass in a -JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as -`--jsonl-loc` with optional `id` values that correspond to the vectors table. -Just loading in vectors will not automatically populate the vocab. +clusters and word vectors. Note that in order to populate the model's vocab, you +need to pass in a JSONL-formatted +[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional +`id` values that correspond to the vectors table. Just loading in vectors will +not automatically populate the vocab. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 3bce3db93..0f87b8fd0 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -316,7 +316,7 @@ factories. The following registries are added by the [`spacy-transformers`](https://github.com/explosion/spacy-transformers) package. See the [`Transformer`](/api/transformer) API reference and -[usage docs](/usage/transformers) for details. +[usage docs](/usage/embeddings-transformers) for details. > #### Example > diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 19cb4daa2..d4d7de161 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -41,7 +41,8 @@ token, the spaCy token receives the sum of their values. To access the values, you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The package also adds the function registries [`@span_getters`](#span_getters) and [`@annotation_setters`](#annotation_setters) with several built-in registered -functions. For more details, see the [usage documentation](/usage/transformers). +functions. For more details, see the +[usage documentation](/usage/embeddings-transformers). ## Config and implementation {#config} diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 9ff55f815..a04c96236 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -77,12 +77,14 @@ or flagging duplicates. For example, you can suggest a user content that's similar to what they're currently looking at, or label a support ticket as a duplicate if it's very similar to an already existing one. -Each `Doc`, `Span` and `Token` comes with a -[`.similarity()`](/api/token#similarity) method that lets you compare it with -another object, and determine the similarity. Of course similarity is always -subjective – whether "dog" and "cat" are similar really depends on how you're -looking at it. spaCy's similarity model usually assumes a pretty general-purpose -definition of similarity. +Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and +[`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity) +method that lets you compare it with another object, and determine the +similarity. 
Of course similarity is always subjective – whether "dog" and "cat" +are similar really depends on how you're looking at it. spaCy's similarity model +usually assumes a pretty general-purpose definition of similarity. + + ```python ### {executable="true"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md new file mode 100644 index 000000000..23037a3ab --- /dev/null +++ b/website/docs/usage/embeddings-transformers.md @@ -0,0 +1,459 @@ +--- +title: Embeddings, Transformers and Transfer Learning +teaser: Using transformer embeddings like BERT in spaCy +menu: + - ['Embedding Layers', 'embedding-layers'] + - ['Transformers', 'transformers'] + - ['Static Vectors', 'static-vectors'] + - ['Pretraining', 'pretraining'] +next: /usage/training +--- + + + +## Shared embedding layers {#embedding-layers} + + + + + +The key difference between [word vectors](#word-vectors) and contextual language +models such as [transformers](#transformers) is that word vectors model +**lexical types**, rather than _tokens_. If you have a list of terms with no +context around them, a transformer model like BERT can't really help you. BERT +is designed to understand language **in context**, which isn't what you have. A +word vectors table will be a much better fit for your task. However, if you do +have words in context — whole sentences or paragraphs of running text — word +vectors will only provide a very rough approximation of what the text is about. + +Word vectors are also very computationally efficient, as they map a word to a +vector with a single indexing operation. Word vectors are therefore useful as a +way to **improve the accuracy** of neural network models, especially models that +are small or have received little or no pretraining. In spaCy, word vector +tables are only used as **static features**. spaCy does not backpropagate +gradients to the pretrained word vectors table. The static vectors table is +usually used in combination with a smaller table of learned task-specific +embeddings. + + + + + +Word vectors are not compatible with most [transformer models](#transformers), +but if you're training another type of NLP network, it's almost always worth +adding word vectors to your model. As well as improving your final accuracy, +word vectors often make experiments more consistent, as the accuracy you reach +will be less sensitive to how the network is randomly initialized. High variance +due to random chance can slow down your progress significantly, as you need to +run many experiments to filter the signal from the noise. + +Word vector features need to be enabled prior to training, and the same word +vectors table will need to be available at runtime as well. You cannot add word +vector features once the model has already been trained, and you usually cannot +replace one word vectors table with another without causing a significant loss +of performance. + + + +## Using transformer models {#transformers} + +Transformers are a family of neural network architectures that compute **dense, +context-sensitive representations** for the tokens in your documents. Downstream +models in your pipeline can then use these representations as input features to +**improve their predictions**. You can connect multiple components to a single +transformer model, with any or all of those components giving feedback to the +transformer to fine-tune it to your tasks. 
spaCy's transformer support +interoperates with [PyTorch](https://pytorch.org) and the +[HuggingFace `transformers`](https://huggingface.co/transformers/) library, +giving you access to thousands of pretrained models for your pipelines. There +are many [great guides](http://jalammar.github.io/illustrated-transformer/) to +transformer models, but for practical purposes, you can simply think of them as +a drop-in replacement that let you achieve **higher accuracy** in exchange for +**higher training and runtime costs**. + +### Setup and installation {#transformers-installation} + +> #### System requirements +> +> We recommend an NVIDIA **GPU** with at least **10GB of memory** in order to +> work with transformer models. Make sure your GPU drivers are up to date and +> you have **CUDA v9+** installed. + +> The exact requirements will depend on the transformer model. Training a +> transformer-based model without a GPU will be too slow for most practical +> purposes. +> +> Provisioning a new machine will require about **5GB** of data to be +> downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB +> spaCy and dependencies. + +Once you have CUDA installed, you'll need to install two pip packages, +[`cupy`](https://docs.cupy.dev/en/stable/install.html) and +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy` +is just like `numpy`, but for GPU. The best way to install it is to choose a +wheel that matches the version of CUDA you're using. You may also need to set +the `CUDA_PATH` environment variable if your CUDA runtime is installed in a +non-standard location. Putting it all together, if you had installed CUDA 10.2 +in `/opt/nvidia/cuda`, you would run: + +```bash +### Installation with CUDA +export CUDA_PATH="/opt/nvidia/cuda" +pip install cupy-cuda102 +pip install spacy-transformers +``` + +### Runtime usage {#transformers-runtime} + +Transformer models can be used as **drop-in replacements** for other types of +neural networks, so your spaCy pipeline can include them in a way that's +completely invisible to the user. Users will download, load and use the model in +the standard way, like any other spaCy pipeline. Instead of using the +transformers as subnetworks directly, you can also use them via the +[`Transformer`](/api/transformer) pipeline component. + +![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) + +The `Transformer` component sets the +[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, +which lets you access the transformers outputs at runtime. + +```bash +$ python -m spacy download en_core_trf_lg +``` + +```python +### Example +import spacy +from thinc.api import use_pytorch_for_gpu_memory, require_gpu + +# Use the GPU, with memory allocations directed via PyTorch. +# This prevents out-of-memory errors that would otherwise occur from competing +# memory pools. +use_pytorch_for_gpu_memory() +require_gpu(0) + +nlp = spacy.load("en_core_trf_lg") +for doc in nlp.pipe(["some text", "some other text"]): + tokvecs = doc._.trf_data.tensors[-1] +``` + +You can also customize how the [`Transformer`](/api/transformer) component sets +annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. +This callback will be called with the raw input and output data for the whole +batch, along with the batch of `Doc` objects, allowing you to implement whatever +you need. 
The annotation setter is called with a batch of [`Doc`](/api/doc) +objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) +containing the transformers data for the batch. + +```python +def custom_annotation_setter(docs, trf_data): + # TODO: + ... + +nlp = spacy.load("en_core_trf_lg") +nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter +doc = nlp("This is a text") +print() # TODO: +``` + +### Training usage {#transformers-training} + +The recommended workflow for training is to use spaCy's +[config system](/usage/training#config), usually via the +[`spacy train`](/api/cli#train) command. The training config defines all +component settings and hyperparameters in one place and lets you describe a tree +of objects by referring to creation functions, including functions you register +yourself. For details on how to get started with training your own model, check +out the [training quickstart](/usage/training#quickstart). + + + +The easiest way to get started is to clone a transformers-based project +template. Swap in your data, edit the settings and hyperparameters and train, +evaluate, package and visualize your model. + + + +The `[components]` section in the [`config.cfg`](/api/data-formats#config) +describes the pipeline components and the settings used to construct them, +including their model implementation. Here's a config snippet for the +[`Transformer`](/api/transformer) component, along with matching Python code. In +this case, the `[components.transformer]` block describes the `transformer` +component: + +> #### Python equivalent +> +> ```python +> from spacy_transformers import Transformer, TransformerModel +> from spacy_transformers.annotation_setters import null_annotation_setter +> from spacy_transformers.span_getters import get_doc_spans +> +> trf = Transformer( +> nlp.vocab, +> TransformerModel( +> "bert-base-cased", +> get_spans=get_doc_spans, +> tokenizer_config={"use_fast": True}, +> ), +> annotation_setter=null_annotation_setter, +> max_batch_items=4096, +> ) +> ``` + +```ini +### config.cfg (excerpt) +[components.transformer] +factory = "transformer" +max_batch_items = 4096 + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "bert-base-cased" +tokenizer_config = {"use_fast": true} + +[components.transformer.model.get_spans] +@span_getters = "doc_spans.v1" + +[components.transformer.annotation_setter] +@annotation_setters = "spacy-transformer.null_annotation_setter.v1" + +``` + +The `[components.transformer.model]` block describes the `model` argument passed +to the transformer component. It's a Thinc +[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the +component. Here, it references the function +[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel) +registered in the [`architectures` registry](/api/top-level#registry). If a key +in a block starts with `@`, it's **resolved to a function** and all other +settings are passed to the function as arguments. In this case, `name`, +`tokenizer_config` and `get_spans`. + +`get_spans` is a function that takes a batch of `Doc` object and returns lists +of potentially overlapping `Span` objects to process by the transformer. Several +[built-in functions](/api/transformer#span-getters) are available – for example, +to process the whole document or individual sentences. When the config is +resolved, the function is created and passed into the model as an argument. 
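For illustration, here's roughly what a whole-document span getter looks like – a simplified sketch of what the built-in `doc_spans.v1` does, not the exact implementation:

```python
### Simplified span getter
from typing import List
from spacy.tokens import Doc, Span

def get_doc_spans(docs: List[Doc]) -> List[List[Span]]:
    # One span per Doc, covering the entire document
    return [[doc[:]] for doc in docs]
```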
+ + + +Remember that the `config.cfg` used for training should contain **no missing +values** and requires all settings to be defined. You don't want any hidden +defaults creeping in and changing your results! spaCy will tell you if settings +are missing, and you can run +[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in +all defaults. + + + +### Customizing the settings {#transformers-training-custom-settings} + +To change any of the settings, you can edit the `config.cfg` and re-run the +training. To change any of the functions, like the span getter, you can replace +the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to +process sentences. You can also register your own functions using the +`span_getters` registry: + +> #### config.cfg +> +> ```ini +> [components.transformer.model.get_spans] +> @span_getters = "custom_sent_spans" +> ``` + +```python +### code.py +import spacy_transformers + +@spacy_transformers.registry.span_getters("custom_sent_spans") +def configure_custom_sent_spans(): + # TODO: write custom example + def get_sent_spans(docs): + return [list(doc.sents) for doc in docs] + + return get_sent_spans +``` + +To resolve the config during training, spaCy needs to know about your custom +function. You can make it available via the `--code` argument that can point to +a Python file. For more details on training with custom code, see the +[training documentation](/usage/training#custom-code). + +```bash +$ python -m spacy train ./config.cfg --code ./code.py +``` + +### Customizing the model implementations {#training-custom-model} + +The [`Transformer`](/api/transformer) component expects a Thinc +[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` +argument. You're not limited to the implementation provided by +`spacy-transformers` – the only requirement is that your registered function +must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that +is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the +transformer data. + +> #### Model type annotations +> +> In the documentation and code base, you may come across type annotations and +> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc], +> List[Floats2d]]~~. This so-called generic type describes the layer and its +> input and output type – in this case, it takes a list of `Doc` objects as the +> input and list of 2-dimensional arrays of floats as the output. You can read +> more about defining Thinc models [here](https://thinc.ai/docs/usage-models). +> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for +> how to enable linting in your editor to see live feedback if your inputs and +> outputs don't match. + +The same idea applies to task models that power the **downstream components**. +Most of spaCy's built-in model creation functions support a `tok2vec` argument, +which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This +is where we'll plug in our transformer model, using the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily +delegates to the `Transformer` pipeline component. 
+ +```ini +### config.cfg (excerpt) {highlight="12"} +[components.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy-transformers.Tok2VecListener.v1" +grad_factor = 1.0 + +[nlp.pipeline.ner.model.tok2vec.pooling] +@layers = "reduce_mean.v1" +``` + +The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a +[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument +`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. This layer +determines how the vector for each spaCy token will be computed from the zero or +more source rows the token is aligned against. Here we use the +[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which +averages the wordpiece rows. We could instead use +[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom +function you write yourself. + +You can have multiple components all listening to the same transformer model, +and all passing gradients back to it. By default, all of the gradients will be +**equally weighted**. You can control this with the `grad_factor` setting, which +lets you reweight the gradients from the different listeners. For instance, +setting `grad_factor = 0` would disable gradients from one of the listeners, +while `grad_factor = 2.0` would multiply them by 2. This is similar to having a +custom learning rate for each component. Instead of a constant, you can also +provide a schedule, allowing you to freeze the shared parameters at the start of +training. + +## Static vectors {#static-vectors} + + + +### Using word vectors in your models {#word-vectors-models} + +Many neural network models are able to use word vector tables as additional +features, which sometimes results in significant improvements in accuracy. +spaCy's built-in embedding layer, +[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use +word vector tables using the `also_use_static_vectors` flag. This setting is +also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) +layer, which builds the default token-to-vector encoding architecture. + +```ini +[tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 128 +rows = 7000 +also_embed_subwords = true +also_use_static_vectors = true +``` + + + +The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in +the `architectures` [registry](/api/top-level#registry), and call the returned +object with the rest of the arguments from the block. This will result in a call +to the +[`MultiHashEmbed`](https://github.com/explosion/spacy/tree/develop/spacy/ml/models/tok2vec.py) +function, which will return a [Thinc](https://thinc.ai) model object with the +type signature ~~Model[List[Doc], List[Floats2d]]~~. Because the embedding layer +takes a list of `Doc` objects as input, it does not need to store a copy of the +vectors table. The vectors will be retrieved from the `Doc` objects that are +passed in, via the `doc.vocab.vectors` attribute. This part of the process is +handled by the [StaticVectors](/api/architectures#StaticVectors) layer. 
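Conceptually, that lookup amounts to indexing into the shared table through each `Doc`'s vocab. The following is only a rough sketch of the idea – it ignores the learned projection and the batching that the real [StaticVectors](/api/architectures#StaticVectors) layer performs:

```python
### Conceptual sketch of the static lookup
from typing import List
import numpy
from spacy.tokens import Doc

def get_static_rows(docs: List[Doc]) -> List[numpy.ndarray]:
    outputs = []
    for doc in docs:
        vectors = doc.vocab.vectors  # shared table, not copied into the model
        # Missing keys fall back to row 0 here for simplicity
        rows = [vectors.key2row.get(token.orth, 0) for token in doc]
        outputs.append(vectors.data[rows])
    return outputs
```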
+ + + +#### Creating a custom embedding layer {#custom-embedding-layer} + +The [MultiHashEmbed](/api/architectures#StaticVectors) layer is spaCy's +recommended strategy for constructing initial word representations for your +neural network models, but you can also implement your own. You can register any +function to a string name, and then reference that function within your config +(see the [training docs](/usage/training) for more details). To try this out, +you can save the following little example to a new Python file: + +```python +from spacy.ml.staticvectors import StaticVectors +from spacy.util import registry + +print("I was imported!") + +@registry.architectures("my_example.MyEmbedding.v1") +def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]: + print("I was called!") + return StaticVectors(nO=output_width) +``` + +If you pass the path to your file to the [`spacy train`](/api/cli#train) command +using the `--code` argument, your file will be imported, which means the +decorator registering the function will be run. Your function is now on equal +footing with any of spaCy's built-ins, so you can drop it in instead of any +other model with the same input and output signature. For instance, you could +use it in the tagger model as follows: + +```ini +[tagger.model.tok2vec.embed] +@architectures = "my_example.MyEmbedding.v1" +output_width = 128 +``` + +Now that you have a custom function wired into the network, you can start +implementing the logic you're interested in. For example, let's say you want to +try a relatively simple embedding strategy that makes use of static word +vectors, but combines them via summation with a smaller table of learned +embeddings. + +```python +from thinc.api import add, chain, remap_ids, Embed +from spacy.ml.staticvectors import StaticVectors + +@registry.architectures("my_example.MyEmbedding.v1") +def MyCustomVectors( + output_width: int, + vector_width: int, + embed_rows: int, + key2row: Dict[int, int] +) -> Model[List[Doc], List[Floats2d]]: + return add( + StaticVectors(nO=output_width), + chain( + FeatureExtractor(["ORTH"]), + remap_ids(key2row), + Embed(nO=output_width, nV=embed_rows) + ) + ) +``` + +## Pretraining {#pretraining} + + diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index ac922c4fa..325063e58 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -9,6 +9,7 @@ menu: - ['Tokenization', 'tokenization'] - ['Merging & Splitting', 'retokenization'] - ['Sentence Segmentation', 'sbd'] + - ['Vectors & Similarity', 'vectors-similarity'] - ['Language data', 'language-data'] --- @@ -1024,10 +1025,10 @@ produced by the tokenizer. > > If you're working with transformer models like BERT, check out the > [`spacy-transformers`](https://github.com/explosion/spacy-transformers) -> extension package and [documentation](/usage/transformers). It includes a -> pipeline component for using pretrained transformer weights and **training -> transformer models** in spaCy, as well as helpful utilities for aligning word -> pieces to linguistic tokenization. +> extension package and [documentation](/usage/embeddings-transformers). It +> includes a pipeline component for using pretrained transformer weights and +> **training transformer models** in spaCy, as well as helpful utilities for +> aligning word pieces to linguistic tokenization. 
```python ### Custom BERT word piece tokenizer @@ -1510,7 +1511,7 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). Here's an example of a component that implements a pre-processing rule for -splitting on `'...'` tokens. The component is added before the parser, which is +splitting on `"..."` tokens. The component is added before the parser, which is then used to further segment the text. That's possible, because `is_sent_start` is only set to `True` for some of the tokens – all others still specify `None` for unset sentence boundaries. This approach can be useful if you want to @@ -1540,6 +1541,152 @@ doc = nlp(text) print("After:", [sent.text for sent in doc.sents]) ``` +## Word vectors and semantic similarity {#vectors-similarity} + +import Vectors101 from 'usage/101/\_vectors-similarity.md' + + + + + +Computing similarity scores can be helpful in many situations, but it's also +important to maintain **realistic expectations** about what information it can +provide. Words can be related to each over in many ways, so a single +"similarity" score will always be a **mix of different signals**, and vectors +trained on different data can produce very different results that may not be +useful for your purpose. + +Also note that the similarity of `Doc` or `Span` objects defaults to the +**average** of the token vectors. This means it's insensitive to the order of +the words. Two documents expressing the same meaning with dissimilar wording +will return a lower similarity score than two documents that happen to contain +the same words while expressing different meanings. + + + +### Adding word vectors {#adding-vectors} + +Custom word vectors can be trained using a number of open-source libraries, such +as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), +or Tomas Mikolov's original +[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most +word vector libraries output an easy-to-read text-based format, where each line +consists of the word followed by its vector. For everyday use, we want to +convert the vectors model into a binary format that loads faster and takes up +less space on disk. The easiest way to do this is the +[`init model`](/api/cli#init-model) command-line utility. This will output a +spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to +some nice Latin vectors. You can then pass the directory path to +[`spacy.load`](/api/top-level#spacy.load). + +> #### Usage example +> +> ```python +> nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") +> doc1 = nlp_latin("Caecilius est in horto") +> doc2 = nlp_latin("servus est in atrio") +> doc1.similarity(doc2) +> ``` + +```bash +wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz +python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz +``` + + + +To help you strike a good balance between coverage and memory usage, spaCy's +[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same +row** of the table. If you're using the +[`spacy init model`](/api/cli#init-model) command to create a vocabulary, +pruning the vectors will be taken care of automatically if you set the +`--prune-vectors` flag. You can also do it manually in the following steps: + +1. Start with a **word vectors model** that covers a huge vocabulary. 
For + instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) + model provides 300-dimensional GloVe vectors for over 1 million terms of + English. +2. If your vocabulary has values set for the `Lexeme.prob` attribute, the + lexemes will be sorted by descending probability to determine which vectors + to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. +3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of + vectors you want to keep. + +```python +nlp = spacy.load('en_vectors_web_lg') +n_vectors = 105000 # number of vectors to keep +removed_words = nlp.vocab.prune_vectors(n_vectors) + +assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned +assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries +``` + +[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector +table to a given number of unique entries, and returns a dictionary containing +the removed words, mapped to `(string, score)` tuples, where `string` is the +entry the removed word was mapped to, and `score` the similarity score between +the two words. + +```python +### Removed words +{ + "Shore": ("coast", 0.732257), + "Precautionary": ("caution", 0.490973), + "hopelessness": ("sadness", 0.742366), + "Continous": ("continuous", 0.732549), + "Disemboweled": ("corpse", 0.499432), + "biostatistician": ("scientist", 0.339724), + "somewheres": ("somewheres", 0.402736), + "observing": ("observe", 0.823096), + "Leaving": ("leaving", 1.0), +} +``` + +In the example above, the vector for "Shore" was removed and remapped to the +vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to +the vector of "leaving", which is identical. If you're using the +[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors` +option to easily reduce the size of the vectors as you add them to a spaCy +model: + +```bash +$ python -m spacy init model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 +``` + +This will create a spaCy model with vectors for the first 10,000 words in the +vectors model. All other words in the vectors model are mapped to the closest +vector among those retained. + + + +### Adding vectors individually {#adding-individual-vectors} + +The `vector` attribute is a **read-only** numpy or cupy array (depending on +whether you've configured spaCy to use GPU memory), with dtype `float32`. The +array is read-only so that spaCy can avoid unnecessary copy operations where +possible. You can modify the vectors via the [`Vocab`](/api/vocab) or +[`Vectors`](/api/vectors) table. Using the +[`Vocab.set_vector`](/api/vocab#set_vector) method is often the easiest approach +if you have vectors in an arbitrary format, as you can read in the vectors with +your own logic, and just set them with a simple loop. This method is likely to +be slower than approaches that work with the whole vectors table at once, but +it's a great approach for once-off conversions before you save out your model to +disk. 
+ +```python +### Adding vectors +from spacy.vocab import Vocab + +vector_data = { + "dog": numpy.random.uniform(-1, 1, (300,)), + "cat": numpy.random.uniform(-1, 1, (300,)), + "orange": numpy.random.uniform(-1, 1, (300,)) +} +vocab = Vocab() +for word, vector in vector_data.items(): + vocab.set_vector(word, vector) +``` + ## Language data {#language-data} import LanguageData101 from 'usage/101/\_language-data.md' diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 8df4b200d..2b040a832 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1,6 +1,6 @@ --- title: Language Processing Pipelines -next: /usage/vectors-embeddings +next: /usage/embeddings-transformers menu: - ['Processing Text', 'processing'] - ['How Pipelines Work', 'pipelines'] @@ -324,9 +324,9 @@ pretrained components and new components trained on your data. When reusing components across models, keep in mind that the **vocabulary**, **vectors** and model settings **must match**. If a pretrained model includes -[word vectors](/usage/vectors-embeddings) and the component uses them as -features, the model you copy it to needs to have the _same_ vectors available – -otherwise, it won't be able to make the same predictions. +[word vectors](/usage/linguistic-features#vectors-similarity) and the component +uses them as features, the model you copy it to needs to have the _same_ vectors +available – otherwise, it won't be able to make the same predictions. @@ -1202,7 +1202,7 @@ document similarity method. Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token` objects by adding a component to the pipeline. For instance, to customize the [`Doc.similarity`](/api/doc#similarity) method, you can add a component that -sets a custom function to `doc.user_hooks['similarity']`. The built-in +sets a custom function to `doc.user_hooks["similarity"]`. The built-in `Doc.similarity` method will check the `user_hooks` dict, and delegate to your function if you've set one. Similar results can be achieved by setting functions to `Doc.user_span_hooks` and `Doc.user_token_hooks`. diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 49cdd96ea..df08e0320 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -247,7 +247,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md' To learn more about word vectors, how to **customize them** and how to load **your own vectors** into spaCy, see the usage guide on -[using word vectors and semantic similarities](/usage/vectors-embeddings). +[using word vectors and semantic similarities](/usage/linguistic-features#vectors-similarity). diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index fc1624ec1..9f74cafac 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -30,7 +30,7 @@ ready-to-use spaCy models. -## Quickstart {#quickstart} +## Quickstart {#quickstart tag="new"} The recommended way to train your spaCy models is via the [`spacy train`](/api/cli#train) command on the command line. It only needs a @@ -131,7 +131,7 @@ Some of the main advantages and features of spaCy's training config are: multiple components, define them once and reference them as [variables](#config-interpolation). - **Reproducibility with no hidden defaults.** The config file is the "single - source of truth" and includes all settings. + source of truth" and includes all settings. 
- **Automated checks and validation.** When you load a config, spaCy checks if the settings are complete and if all values have the correct types. This lets you catch potential mistakes early. In your custom architectures, you can use @@ -667,7 +667,7 @@ visualize your model. For more details on how to integrate transformer models into your training config and customize the implementations, see the usage guide on -[training transformers](/usage/transformers#training). +[training transformers](/usage/embeddings-transformers#transformers-training). ### Pretraining with spaCy {#pretraining} diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 59a842968..f7bcc17d3 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -218,7 +218,7 @@ available via `token.orth`. The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors assigned to strings, and lets you assign vectors individually, or -[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a +[load in GloVe vectors](/usage/linguistic-features#adding-vectors) from a directory. To help you strike a good balance between coverage and memory usage, the `Vectors` class lets you map **multiple keys** to the **same row** of the table. If you're using the [`spacy init-model`](/api/cli#init-model) command to diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 7213adf4a..fda5393a4 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -30,7 +30,7 @@ menu: -- **Usage:** [Transformers](/usage/transformers), +- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), [Training models](/usage/training) - **API:** [`Transformer`](/api/transformer), [`TransformerData`](/api/transformer#transformerdata), @@ -59,13 +59,13 @@ menu: ### New built-in pipeline components {#features-pipeline-components} -| Name | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | -| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | -| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | -| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | -| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | +| Name | Description | +| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | +| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | +| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | +| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. 
| +| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md deleted file mode 100644 index 184436d12..000000000 --- a/website/docs/usage/vectors-embeddings.md +++ /dev/null @@ -1,340 +0,0 @@ ---- -title: Vectors and Embeddings -menu: - - ["What's a Word Vector?", 'whats-a-vector'] - - ['Using Word Vectors', 'usage'] - - ['Converting and Importing', 'converting'] -next: /usage/transformers ---- - -Word vector tables (or "embeddings") let you find similar terms, and can improve -the accuracy of some of your components. You can even use word vectors as a -quick-and-dirty text-classification solution when you don't have any training data. -Word vector tables are included in some of the spaCy [model packages](/models) -we distribute, and you can easily create your own model packages with word -vectors you train or download yourself. - -## What's a word vector? {#whats-a-vector} - -For spaCy's purposes, a "word vector" is a 1-dimensional slice from a -2-dimensional **vectors table**, with a deterministic mapping from word types to -rows in the table. - -```python -def what_is_a_word_vector( - word_id: int, - key2row: Dict[int, int], - vectors_table: Floats2d, - *, - default_row: int=0 -) -> Floats1d: - return vectors_table[key2row.get(word_id, default_row)] -``` - -An old idea in linguistics is that you can "know a word by the company it -keeps": that is, word meanings can be understood relationally, based on their -patterns of usage. This idea inspired a branch of NLP research known as -"distributional semantics" that has aimed to compute databases of lexical -knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec) -family of algorithms are a key milestone in this line of research. For -simplicity, we will refer to a distributional word representation as a "word -vector", and algorithms that computes word vectors (such as -[GloVe](https://nlp.stanford.edu/projects/glove/), -[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms". - -Word2vec algorithms try to produce vectors tables that let you estimate useful -relationships between words using simple linear algebra operations. For -instance, you can often find close synonyms of a word by finding the vectors -closest to it by cosine distance, and then finding the words that are mapped to -those neighboring vectors. Word vectors can also be useful as features in -statistical models. - -### Word vectors vs. contextual language models {#vectors-vs-language-models} - -The key difference between word vectors and contextual language models such -as [transformers](/usage/transformers) -is that word vectors model **lexical types**, rather than -_tokens_. If you have a list of terms with no context around them, -a transformer model like BERT can't really help you. BERT is designed to understand -language **in context**, which isn't what you have. A word vectors table will be -a much better fit for your task. However, if you do have words in context — whole -sentences or paragraphs of running text — word vectors will only provide a very -rough approximation of what the text is about. - -Word vectors are also very computationally efficient, as they map a word to a -vector with a single indexing operation. 
Word vectors are therefore useful as a -way to **improve the accuracy** of neural network models, especially models that -are small or have received little or no pretraining. In spaCy, word vector -tables are only used as **static features**. spaCy does not backpropagate -gradients to the pretrained word vectors table. The static vectors table is -usually used in combination with a smaller table of learned task-specific -embeddings. - -## Using word vectors {#usage} - -spaCy stores word vector information in the -[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole -vectors table from most spaCy objects. You can also access the vector for a -[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or -[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or -`Span` has multiple tokens, the average of the word vectors will be returned, -excluding any "out of vocabulary" entries that have no vector available. If none -of the words have a vector, a zeroed vector will be returned. - -The `vector` attribute is a **read-only** numpy or cupy array (depending on -whether you've configured spaCy to use GPU memory), with dtype `float32`. The -array is read-only so that spaCy can avoid unnecessary copy operations where -possible. You can modify the vectors via the `Vocab` or `Vectors` table. - -### Word vectors and similarity - -A common use-case of word vectors is to answer _similarity questions_. You can -ask how similar a `token`, `span`, `doc` or `lexeme` is to another object using -the `.similarity()` method. You can even check the similarity of mismatched -types, asking how similar a whole document is to a particular word, how similar -a span is to a document, etc. By default, the `.similarity()` method will use -return the cosine of the `.vector` attribute of the two objects being compared. -You can customize this behavior by setting one or more -[user hooks](/usage/processing-pipelines#custom-components-user-hooks) for the -types you want to customize. - -Word vector similarity is a practical technique for many situations, especially -since it's easy to use and relatively efficient to compute. However, it's -important to maintain realistic expectations about what information it can -provide. Words can be related to each over in many ways, so a single -"similarity" score will always be a mix of different signals. The word vectors -model is also not trained for your specific use-case, so you have no way of -telling it which results are more or less useful for your purpose. These -problems are even more accute when you go from measuring the similarity of -single words to the similarity of spans or documents. The vector averaging -process is insensitive to the order of the words, so `doc1.similarity(doc2)` -will mostly be based on the overlap in lexical items between the two documents -objects. Two documents expressing the same meaning with dissimilar wording will -return a lower similarity score than two documents that happen to contain the -same words while expressing different meanings. - -### Using word vectors in your models - -Many neural network models are able to use word vector tables as additional -features, which sometimes results in significant improvements in accuracy. -spaCy's built-in embedding layer, `spacy.MultiHashEmbed.v1`, can be configured -to use word vector tables using the `also_use_static_vectors` flag. 
This -setting is also available on the `spacy.MultiHashEmbedCNN.v1` layer, which -builds the default token-to-vector encoding architecture. - -``` -[tagger.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true -``` - - -The configuration system will look up the string `spacy.MultiHashEmbed.v1` -in the `architectures` registry, and call the returned object with the -rest of the arguments from the block. This will result in a call to the -`spacy.ml.models.tok2vec.MultiHashEmbed` function, which will return -a Thinc model object with the type signature `Model[List[Doc], -List[Floats2d]]`. Because the embedding layer takes a list of `Doc` objects as -input, it does not need to store a copy of the vectors table. The vectors will -be retrieved from the `Doc` objects that are passed in, via the -`doc.vocab.vectors` attribute. This part of the process is handled by the -`spacy.ml.staticvectors.StaticVectors` layer. - - -#### Creating a custom embedding layer - -The `MultiHashEmbed` layer is spaCy's recommended strategy for constructing -initial word representations for your neural network models, but you can also -implement your own. You can register any function to a string name, and then -reference that function within your config (see the [training]("/usage/training") -section for more details). To try this out, you can save the following little -example to a new Python file: - -``` -from spacy.ml.staticvectors import StaticVectors -from spacy.util import registry - -print("I was imported!") - -@registry.architectures("my_example.MyEmbedding.v1") -def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]: - print("I was called!") - return StaticVectors(nO=output_width) -``` - -If you pass the path to your file to the `spacy train` command using the `-c` -argument, your file will be imported, which means the decorator registering the -function will be run. Your function is now on equal footing with any of spaCy's -built-ins, so you can drop it in instead of any other model with the same input -and output signature. For instance, you could use it in the tagger model as -follows: - -``` -[tagger.model.tok2vec.embed] -@architectures = "my_example.MyEmbedding.v1" -output_width = 128 -``` - -Now that you have a custom function wired into the network, you can start -implementing the logic you're interested in. For example, let's say you want to -try a relatively simple embedding strategy that makes use of static word vectors, -but combines them via summation with a smaller table of learned embeddings. - -```python -from thinc.api import add, chain, remap_ids, Embed -from spacy.ml.staticvectors import StaticVectors - -@registry.architectures("my_example.MyEmbedding.v1") -def MyCustomVectors( - output_width: int, - vector_width: int, - embed_rows: int, - key2row: Dict[int, int] -) -> Model[List[Doc], List[Floats2d]]: - return add( - StaticVectors(nO=output_width), - chain( - FeatureExtractor(["ORTH"]), - remap_ids(key2row), - Embed(nO=output_width, nV=embed_rows) - ) - ) -``` - -#### When should you add word vectors to your model? - -Word vectors are not compatible with most [transformer models](/usage/transformers), -but if you're training another type of NLP network, it's almost always worth -adding word vectors to your model. 
As well as improving your final accuracy, -word vectors often make experiments more consistent, as the accuracy you -reach will be less sensitive to how the network is randomly initialized. High -variance due to random chance can slow down your progress significantly, as you -need to run many experiments to filter the signal from the noise. - -Word vector features need to be enabled prior to training, and the same word vectors -table will need to be available at runtime as well. You cannot add word vector -features once the model has already been trained, and you usually cannot -replace one word vectors table with another without causing a significant loss -of performance. - -## Converting word vectors for use in spaCy {#converting} - -Custom word vectors can be trained using a number of open-source libraries, such -as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), -or Tomas Mikolov's original -[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most -word vector libraries output an easy-to-read text-based format, where each line -consists of the word followed by its vector. For everyday use, we want to -convert the vectors model into a binary format that loads faster and takes up -less space on disk. The easiest way to do this is the -[`init-model`](/api/cli#init-model) command-line utility: - -```bash -wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz -python -m spacy init-model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz -``` - -This will output a spaCy model in the directory `/tmp/la_vectors_wiki_lg`, -giving you access to some nice Latin vectors 😉 You can then pass the directory -path to [`spacy.load()`](/api/top-level#spacy.load). - -```python -nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") -doc1 = nlp_latin("Caecilius est in horto") -doc2 = nlp_latin("servus est in atrio") -doc1.similarity(doc2) -``` - -The model directory will have a `/vocab` directory with the strings, lexical -entries and word vectors from the input vectors model. The -[`init-model`](/api/cli#init-model) command supports a number of archive formats -for the word vectors: the vectors can be in plain text (`.txt`), zipped -(`.zip`), or tarred and zipped (`.tgz`). - -### Optimizing vector coverage {#custom-vectors-coverage new="2"} - -To help you strike a good balance between coverage and memory usage, spaCy's -[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same -row** of the table. If you're using the -[`spacy init-model`](/api/cli#init-model) command to create a vocabulary, -pruning the vectors will be taken care of automatically if you set the -`--prune-vectors` flag. You can also do it manually in the following steps: - -1. Start with a **word vectors model** that covers a huge vocabulary. For - instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) - model provides 300-dimensional GloVe vectors for over 1 million terms of - English. -2. If your vocabulary has values set for the `Lexeme.prob` attribute, the - lexemes will be sorted by descending probability to determine which vectors - to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. -3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of - vectors you want to keep. 
- -```python -nlp = spacy.load('en_vectors_web_lg') -n_vectors = 105000 # number of vectors to keep -removed_words = nlp.vocab.prune_vectors(n_vectors) - -assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned -assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries -``` - -[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector -table to a given number of unique entries, and returns a dictionary containing -the removed words, mapped to `(string, score)` tuples, where `string` is the -entry the removed word was mapped to, and `score` the similarity score between -the two words. - -```python -### Removed words -{ - "Shore": ("coast", 0.732257), - "Precautionary": ("caution", 0.490973), - "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), - "Disemboweled": ("corpse", 0.499432), - "biostatistician": ("scientist", 0.339724), - "somewheres": ("somewheres", 0.402736), - "observing": ("observe", 0.823096), - "Leaving": ("leaving", 1.0), -} -``` - -In the example above, the vector for "Shore" was removed and remapped to the -vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to -the vector of "leaving", which is identical. If you're using the -[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors` -option to easily reduce the size of the vectors as you add them to a spaCy -model: - -```bash -$ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 -``` - -This will create a spaCy model with vectors for the first 10,000 words in the -vectors model. All other words in the vectors model are mapped to the closest -vector among those retained. - -### Adding vectors {#adding-vectors} - -You can also add word vectors individually, using the method `vocab.set_vector`. -This is often the easiest approach if you have vectors in an arbitrary format, -as you can read in the vectors with your own logic, and just set them with -a simple loop. This method is likely to be slower than approaches that work -with the whole vectors table at once, but it's a great approach for once-off -conversions before you save out your model to disk. 
- -```python -### Adding vectors -from spacy.vocab import Vocab - -vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)), - "cat": numpy.random.uniform(-1, 1, (300,)), - "orange": numpy.random.uniform(-1, 1, (300,))} -vocab = Vocab() -for word, vector in vector_data.items(): - vocab.set_vector(word, vector) -``` diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 6f8763955..c830619c5 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -18,8 +18,11 @@ { "text": "Linguistic Features", "url": "/usage/linguistic-features" }, { "text": "Rule-based Matching", "url": "/usage/rule-based-matching" }, { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" }, - { "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" }, - { "text": "Transformers", "url": "/usage/transformers", "tag": "new" }, + { + "text": "Embeddings & Transformers", + "url": "/usage/embeddings-transformers", + "tag": "new" + }, { "text": "Training Models", "url": "/usage/training", "tag": "new" }, { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" }, { "text": "Saving & Loading", "url": "/usage/saving-loading" }, diff --git a/website/src/components/typography.js b/website/src/components/typography.js index 41464473f..d37c345b9 100644 --- a/website/src/components/typography.js +++ b/website/src/components/typography.js @@ -9,7 +9,12 @@ import { isString, github, headingTextClassName } from './util' import classes from '../styles/typography.module.sass' export const H1 = ({ Component = 'h1', className, ...props }) => ( - + ) export const H2 = ({ className, ...props }) => ( @@ -90,6 +95,7 @@ const Headline = ({ source, hidden, action, + permalink = true, className, children, }) => { @@ -102,7 +108,7 @@ const Headline = ({ const tags = tag ? tag.split(',').map(t => t.trim()) : [] return ( - {children} + {children} {tags.map((tag, i) => ( {tag}