From 386dcada1cddecc13c499016a3cf5585a2dec088 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 26 Oct 2021 16:53:10 +0200
Subject: [PATCH 1/6] Address random results in slow readers tests (#9544)

* Set random seed for dataset shuffling
* Use more dev examples for non-zero scores
---
 spacy/tests/training/test_readers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index c0c51b287..8c5c81625 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -1,6 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
 from spacy import Language
 from spacy.util import load_model_from_config, registry, resolve_dot_names
 from spacy.schemas import ConfigSchemaTraining
@@ -64,8 +64,8 @@ def test_readers():
 @pytest.mark.parametrize(
     "reader,additional_config",
     [
-        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
-        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
     ],
 )
@@ -93,6 +93,7 @@ def test_cat_readers(reader, additional_config):
         factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
+    fix_random_seed(config["training"]["seed"])
    config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
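
Note (illustrative, not part of the patch): the fix works because `thinc.api.fix_random_seed` seeds the global RNGs (Python's `random`, numpy, and GPU libraries when present) that the `ml_datasets` readers shuffle with, so the sampled dev examples come out identical on every run. A minimal sketch of the idea; the toy dataset and helper below are made up for illustration, not spaCy's test code:

```python
import random

from thinc.api import fix_random_seed


def shuffled_dev_sample(seed: int = 0):
    # Reset the global RNG state before shuffling.
    fix_random_seed(seed)
    examples = list(range(100))  # stand-in for a reader's examples
    random.shuffle(examples)
    return examples[:10]


# With a fixed seed, the sampled "dev split" is reproducible across runs.
assert shuffled_dev_sample(0) == shuffled_dev_sample(0)
```
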
From 72dc63b3fb04e472ae000a71e1125a4950e186d8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 28 Oct 2021 15:32:06 +0200
Subject: [PATCH 2/6] Update for python 3.10 (#9519)

* Update for python 3.10
* Update mac image
* Update build constraints for python 3.10
* Add extras for cupy cuda 11.3-11.5
* Remove cupy-cuda115 extra
* Require thinc>=8.0.12
* Switch CI to windows-2019
* Skip mypy for python 3.10
---
 .github/azure-steps.yml |  1 +
 azure-pipelines.yml     | 27 ++++++++++++++++++---------
 build-constraints.txt   |  3 ++-
 pyproject.toml          |  2 +-
 requirements.txt        |  2 +-
 setup.cfg               |  9 +++++++--
 6 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 8501b2abe..80c88b0b8 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -27,6 +27,7 @@ steps:
 
   - script: python -m mypy spacy
     displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')
 
   - task: DeleteFiles@1
     inputs:
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 6bf591bee..4291b6e0a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -42,7 +42,7 @@ jobs:
          imageName: "ubuntu-18.04"
          python.version: "3.6"
        # Python36Windows:
-       #   imageName: "vs2017-win2016"
+       #   imageName: "windows-2019"
        #   python.version: "3.6"
        # Python36Mac:
        #   imageName: "macos-10.14"
@@ -51,7 +51,7 @@
        #   imageName: "ubuntu-18.04"
        #   python.version: "3.7"
        Python37Windows:
-         imageName: "vs2017-win2016"
+         imageName: "windows-2019"
          python.version: "3.7"
        # Python37Mac:
        #   imageName: "macos-10.14"
@@ -60,7 +60,7 @@
        #   imageName: "ubuntu-18.04"
        #   python.version: "3.8"
        # Python38Windows:
-       #   imageName: "vs2017-win2016"
+       #   imageName: "windows-2019"
        #   python.version: "3.8"
        Python38Mac:
          imageName: "macos-10.14"
@@ -68,12 +68,21 @@
        Python39Linux:
          imageName: "ubuntu-18.04"
          python.version: "3.9"
-       Python39Windows:
-         imageName: "vs2017-win2016"
-         python.version: "3.9"
-       Python39Mac:
-         imageName: "macos-10.14"
-         python.version: "3.9"
+       # Python39Windows:
+       #   imageName: "windows-2019"
+       #   python.version: "3.9"
+       # Python39Mac:
+       #   imageName: "macos-10.14"
+       #   python.version: "3.9"
+       Python310Linux:
+         imageName: "ubuntu-20.04"
+         python.version: "3.10"
+       Python310Windows:
+         imageName: "windows-2019"
+         python.version: "3.10"
+       Python310Mac:
+         imageName: "macos-10.15"
+         python.version: "3.10"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
diff --git a/build-constraints.txt b/build-constraints.txt
index 23e660096..cf5fe3284 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -2,4 +2,5 @@
 numpy==1.15.0; python_version<='3.7'
 numpy==1.17.3; python_version=='3.8'
 numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
diff --git a/pyproject.toml b/pyproject.toml
index cb103de0a..f81484d43 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.11,<8.1.0",
+    "thinc>=8.0.12,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
diff --git a/requirements.txt b/requirements.txt
index 9bc39e323..36cf5c58e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 spacy-legacy>=3.0.8,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.11,<8.1.0
+thinc>=8.0.12,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index b2b7e6be3..e5b03afe6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,6 +21,7 @@ classifiers =
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -37,14 +38,14 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.8,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
@@ -95,6 +96,10 @@ cuda111 =
     cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
     cupy-cuda112>=5.0.0b4,<10.0.0
+cuda113 =
+    cupy-cuda113>=5.0.0b4,<10.0.0
+cuda114 =
+    cupy-cuda114>=5.0.0b4,<10.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
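
Note (illustrative, not part of the patch): each `cudaXXX` extra simply pins the matching `cupy-cudaXXX` wheel, so after e.g. `pip install spacy[cuda113]` on a CUDA 11.3 machine, GPU use is opt-in from Python. A minimal check, assuming one of the cupy extras is installed:

```python
import spacy

# prefer_gpu() activates the GPU if cupy can see one and returns False
# (falling back to CPU) otherwise; require_gpu() would raise instead.
# Call it before loading or creating any pipeline.
if spacy.prefer_gpu():
    print("Running on GPU via cupy")
else:
    print("No usable GPU, staying on CPU")
```
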
From 76173b0866d4f5ebd8fbcf941a1af606de1bc16f Mon Sep 17 00:00:00 2001
From: Philip Vollet
Date: Fri, 29 Oct 2021 06:57:44 +0200
Subject: [PATCH 3/6] fixed typo and URL (#9560)

---
 website/meta/universe.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index df8077419..80608c77d 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1138,7 +1138,7 @@
     {
       "id": "deplacy",
       "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis",
-      "discreption": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
+      "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
       "github": "KoichiYasuoka/deplacy",
       "image": "https://i.imgur.com/6uOI4Op.png",
       "code_example": [
@@ -1270,7 +1270,7 @@
       "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
       "github": "chartbeat-labs/textacy",
       "pip": "textacy",
-      "url": "https://chartbeat-labs.github.io/textacy/",
+      "url": "https://github.com/chartbeat-labs/textacy",
       "author": "Burton DeWilde",
       "author_links": {
         "github": "bdewilde",
From 5477453ea374a88c7ef9bffb9dd0496035a9baa8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 29 Oct 2021 10:35:31 +0200
Subject: [PATCH 4/6] Docs for thinc-apple-ops (#9549)

* Docs for thinc-apple-ops
* Ignore thinc-apple-ops in reqs tests
* Fix install quickstart
* Add cupy cuda 113, 114 extras
* Remove draft section

Co-authored-by: Ines Montani
---
 setup.cfg                                 |  2 ++
 spacy/tests/package/test_requirements.py  |  1 +
 website/docs/usage/index.md               | 15 ++++++++-------
 website/src/widgets/quickstart-install.js | 22 +++++++++++++++++++++-
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index e5b03afe6..dc31228e5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -100,6 +100,8 @@ cuda113 =
     cupy-cuda113>=5.0.0b4,<10.0.0
 cuda114 =
     cupy-cuda114>=5.0.0b4,<10.0.0
+apple =
+    thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 1d51bd609..75908df59 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -25,6 +25,7 @@ def test_build_dependencies():
         "sudachipy",
         "sudachidict_core",
         "spacy-pkuseg",
+        "thinc-apple-ops",
     ]
 
     # check requirements.txt
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 707dd3215..54ab62467 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 > $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
 > ```
 
-| Name                   | Description                                                                                                                                                                                                                                                     |
-| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`              | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
-| `transformers`         | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
-| `ray`                  | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                   |
-| `cuda`, ...            | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                 |
-| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
+| Name             | Description                                                                                                                                                                                                                                                     |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
+| `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
+| `ray`            | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                   |
+| `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                 |
+| `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
+| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
 
 ### conda {#conda}
 
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 8ed602b72..628e1c533 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -4,10 +4,12 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo, DEFAULT_BRANCH } from '../components/util'
 
+const DEFAULT_OS = 'mac'
+const DEFAULT_PLATFORM = 'x86'
 const DEFAULT_MODELS = ['en']
 const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda102'
+const DEFAULT_CUDA = 'cuda113'
 const CUDA = {
     '8.0': 'cuda80',
     '9.0': 'cuda90',
@@ -19,11 +21,15 @@ const CUDA = {
     '11.0': 'cuda110',
     '11.1': 'cuda111',
     '11.2': 'cuda112',
+    '11.3': 'cuda113',
+    '11.4': 'cuda114',
 }
 const LANG_EXTRAS = ['ja'] // only for languages with models
 
 const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
+    const [platform, setPlatform] = useState(DEFAULT_PLATFORM)
+    const [os, setOs] = useState(DEFAULT_OS)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
     const [selectedModels, setModels] = useState(DEFAULT_MODELS)
@@ -33,15 +39,19 @@ const QuickstartInstall = ({ id, title }) => {
         config: v => setTrain(v.includes('train')),
         models: setModels,
         optimize: v => setEfficiency(v.includes('efficiency')),
+        platform: v => setPlatform(v[0]),
+        os: v => setOs(v[0]),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
     }
     const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const apple = os === 'mac' && platform === 'arm'
     const pipExtras = [
         hardware === 'gpu' && cuda,
         train && 'transformers',
         train && 'lookups',
+        apple && 'apple',
         ...modelExtras,
     ]
         .filter(e => e)
@@ -62,6 +72,16 @@ const QuickstartInstall = ({ id, title }) => {
                     { id: 'windows', title: 'Windows' },
                     { id: 'linux', title: 'Linux' },
                 ],
+                defaultValue: DEFAULT_OS,
             },
+            {
+                id: 'platform',
+                title: 'Platform',
+                options: [
+                    { id: 'x86', title: 'x86', checked: true },
+                    { id: 'arm', title: 'ARM / M1' },
+                ],
+                defaultValue: DEFAULT_PLATFORM,
+            },
             {
                 id: 'package',
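
Note (illustrative, not part of the patch): `thinc-apple-ops` provides an alternative thinc ops implementation that uses Apple's native libraries for faster matrix multiplication on M1. A rough way to inspect which ops implementation thinc is currently using; the exact class reported depends on the installed packages and configuration, so treat this purely as a sketch:

```python
from thinc.api import get_current_ops

# get_current_ops() returns the active thinc Ops instance, e.g. NumpyOps on
# plain CPU; with the new `apple` extra installed and selected, an
# AppleOps-based implementation may be reported instead.
ops = get_current_ops()
print(type(ops).__name__)
```
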
From 2fd8d616e77cd48a60007a4c64ca49d5833c1fee Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 29 Oct 2021 08:36:34 +0000
Subject: [PATCH 5/6] Add docs section for spacy.cli.train.train (#9545)

* Add section for spacy.cli.train.train
* Add link from training page to train function
* Ensure path in train helper
* Update docs

Co-authored-by: Ines Montani
---
 spacy/cli/train.py             |  8 +++++---
 website/docs/api/cli.md        | 23 +++++++++++++++++++++++
 website/docs/api/top-level.md  | 10 +++++-----
 website/docs/usage/training.md | 30 +++++++++++++++++++++++++++---
 4 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 664fc2aaf..cc22cbba6 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Union
 from pathlib import Path
 from wasabi import msg
 import typer
@@ -46,12 +46,14 @@ def train_cli(
 
 
 def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
     *,
     use_gpu: int = -1,
     overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 268ea0703..a4462af56 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -819,6 +819,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | overrides   | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES** | The final trained pipeline and the best trained pipeline.                                                                                                                                  |
 
+### Calling the training function from Python {#train-function new="3.2"}
+
+The training CLI exposes a `train` helper function that lets you run the
+training just like `spacy train`. Usually it's easier to use the command line
+directly, but if you need to kick off training from code this is how to do it.
+
+> #### Example
+>
+> ```python
+> from spacy.cli.train import train
+>
+> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+>
+> ```
+
+| Name           | Description                                                                                                                    |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`  | Path to the config to use for training. ~~Union[str, Path]~~                                                                   |
+| `output_path`  | Optional name of directory to save output model in. If not provided a model will not be saved. ~~Optional[Union[str, Path]]~~  |
+| _keyword-only_ |                                                                                                                                |
+| `use_gpu`      | Which GPU to use. Defaults to -1 for no GPU. ~~int~~                                                                           |
+| `overrides`    | Values to override config settings. ~~Dict[str, Any]~~                                                                         |
+
 ## pretrain {#pretrain new="2.1" tag="command,experimental"}
 
 Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index f6910bd5b..c78a1de03 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -826,17 +826,17 @@ from the specified model. Intended for use in `[initialize.before_init]`.
 > after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
 > ```
 
-Recursively wrap the models in each pipe using [NVTX](https://nvidia.github.io/NVTX/)
-range markers. These markers aid in GPU profiling by attributing specific operations
-to a ~~Model~~'s forward or backprop passes.
+Recursively wrap the models in each pipe using
+[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
+profiling by attributing specific operations to a ~~Model~~'s forward or
+backprop passes.
 
 | Name | Description |
-|------------------|------------------------------------------------------------------------------------------------------------------------------|
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
 | `forward_color`  | Color identifier for forward passes. Defaults to `-1`. ~~int~~ |
 | `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~ |
 | **CREATES** | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~ |
-
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 94fdad209..bd5ea7751 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -301,8 +301,6 @@ fly without having to save to and load from disk.
 $ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
 ```
 
-
-
 ### Using variable interpolation {#config-interpolation}
 
 Another very useful feature of the config system is that it supports variable
@@ -1647,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.
 
 ## Internal training API {#api}
 
-
+
 spaCy gives you full control over the training loop. However, for most use
 cases, it's recommended to train your pipelines via the
@@ -1659,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with
 
 
 
+### Training from a Python script {#api-train new="3.2"}
+
+If you want to run the training from a Python script instead of using the
+[`spacy train`](/api/cli#train) CLI command, you can call into the
+[`train`](/api/cli#train-function) helper function directly. It takes the path
+to the config file, an optional output directory and an optional dictionary of
+[config overrides](#config-overrides).
+
+```python
+from spacy.cli.train import train
+
+train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+```
+
+### Internal training loop API {#api-loop}
+
+
+
+This section documents how the training loop and updates to the `nlp` object
+work internally. You typically shouldn't have to implement this in Python unless
+you're writing your own trainable components. To train a pipeline, use
+[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
+function instead.
+
+
+
 The [`Example`](/api/example) object contains annotated training data, also
 called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
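
Note (a usage sketch building on the docs above; all paths are placeholders, not from the patch): once `train()` returns, the pipelines are saved under the output directory in the same layout the CLI produces, so the best model can be loaded directly:

```python
import spacy
from spacy.cli.train import train

# Train with explicit data paths and an output directory; "./output" and
# the .spacy paths below are placeholder values for this sketch.
train(
    "./config.cfg",
    "./output",
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
)

# spacy train writes model-best and model-last into the output directory.
nlp = spacy.load("./output/model-best")
```
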
From 006df1ae1fe2d745dec19cacb3ca71d06447a7aa Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 29 Oct 2021 10:08:40 +0000
Subject: [PATCH 6/6] Clarify error when words are of wrong type (#9541)

* Clarify error when words are of wrong type

See #9437

* Update docs

* Use try/except

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem

Co-authored-by: Adriane Boyd

Co-authored-by: Sofie Van Landeghem
Co-authored-by: Adriane Boyd
---
 spacy/errors.py         |  1 +
 spacy/tokens/doc.pyx    | 16 ++++++++++------
 website/docs/api/doc.md |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index e6912a263..ff1185361 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -877,6 +877,7 @@ class Errors:
              "filename. Specify an epoch to resume from.")
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
+    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5ea3e1e3b..1ee845934 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -194,11 +194,12 @@ cdef class Doc:
 
         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (Optional[List[str]]): A list of unicode strings to add to the document
-            as words. If `None`, defaults to empty list.
-        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
-            words. True means that the word is followed by a space, False means
-            it is not. If `None`, defaults to `[True]*len(words)`
+        words (Optional[List[Union[str, int]]]): A list of unicode strings or
+            hash values to add to the document as words. If `None`, defaults to
+            empty list.
+        spaces (Optional[List[bool]]): A list of boolean values, of the same
+            length as `words`. `True` means that the word is followed by a space,
+            `False` means it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
         tags (Optional[List[str]]): A list of unicode strings, of the same
             length as words, to assign as token.tag. Defaults to None.
@@ -266,7 +267,10 @@ cdef class Doc:
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
-                lexeme = self.vocab.get_by_orth(self.mem, word)
+                try:
+                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                except TypeError:
+                    raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
 
         if heads is not None:
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index e1f18963b..9836b8c21 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | Name           | Description                                                                                                                                                                                  |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                             |
-| `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                           |
+| `words`        | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                 |
 | `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
 | _keyword-only_ |                                                                                                                                                                                              |
 | `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                           |
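
Note (a behavior sketch for the change above; the values are illustrative): strings and integer hash values are both valid `words`, while anything else now surfaces the new E1022 message as a `TypeError` instead of an opaque internal error:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
# Strings (and integer hash values) are accepted as words.
doc = Doc(vocab, words=["hello", "world"], spaces=[True, False])
print([t.text for t in doc])  # ['hello', 'world']

try:
    Doc(vocab, words=[3.14])  # a float is neither str nor int
except TypeError as err:
    print(err)  # E1022: Words must be of type str or int ...
```
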