From 386dcada1cddecc13c499016a3cf5585a2dec088 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 26 Oct 2021 16:53:10 +0200
Subject: [PATCH 1/6] Address random results in slow readers tests (#9544)

* Set random seed for dataset shuffling
* Use more dev examples for non-zero scores
---
 spacy/tests/training/test_readers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index c0c51b287..8c5c81625 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -1,6 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
 from spacy import Language
 from spacy.util import load_model_from_config, registry, resolve_dot_names
 from spacy.schemas import ConfigSchemaTraining
@@ -64,8 +64,8 @@ def test_readers():
 @pytest.mark.parametrize(
     "reader,additional_config",
     [
-        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
-        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
     ],
 )
@@ -93,6 +93,7 @@ def test_cat_readers(reader, additional_config):
         factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
+    fix_random_seed(config["training"]["seed"])
    config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
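
Note (illustrative, not part of the patch): the fix works because `thinc.api.fix_random_seed` seeds the global RNGs (Python's `random`, numpy, and GPU libraries when present) that the `ml_datasets` readers shuffle with, so the sampled dev examples come out identical on every run. A minimal sketch of the idea; the toy dataset and helper below are made up for illustration, not spaCy's test code:

```python
import random

from thinc.api import fix_random_seed


def shuffled_dev_sample(seed: int = 0):
    # Reset the global RNG state before shuffling.
    fix_random_seed(seed)
    examples = list(range(100))  # stand-in for a reader's examples
    random.shuffle(examples)
    return examples[:10]


# With a fixed seed, the sampled "dev split" is reproducible across runs.
assert shuffled_dev_sample(0) == shuffled_dev_sample(0)
```
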
From 72dc63b3fb04e472ae000a71e1125a4950e186d8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 28 Oct 2021 15:32:06 +0200
Subject: [PATCH 2/6] Update for python 3.10 (#9519)

* Update for python 3.10
* Update mac image
* Update build constraints for python 3.10
* Add extras for cupy cuda 11.3-11.5
* Remove cupy-cuda115 extra
* Require thinc>=8.0.12
* Switch CI to windows-2019
* Skip mypy for python 3.10
---
 .github/azure-steps.yml |  1 +
 azure-pipelines.yml     | 27 ++++++++++++++++++---------
 build-constraints.txt   |  3 ++-
 pyproject.toml          |  2 +-
 requirements.txt        |  2 +-
 setup.cfg               |  9 +++++++--
 6 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 8501b2abe..80c88b0b8 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -27,6 +27,7 @@ steps:
 
   - script: python -m mypy spacy
     displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')
 
   - task: DeleteFiles@1
     inputs:
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 6bf591bee..4291b6e0a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -42,7 +42,7 @@ jobs:
          imageName: "ubuntu-18.04"
          python.version: "3.6"
        # Python36Windows:
-       #   imageName: "vs2017-win2016"
+       #   imageName: "windows-2019"
        #   python.version: "3.6"
        # Python36Mac:
        #   imageName: "macos-10.14"
@@ -51,7 +51,7 @@
        #   imageName: "ubuntu-18.04"
        #   python.version: "3.7"
        Python37Windows:
-         imageName: "vs2017-win2016"
+         imageName: "windows-2019"
          python.version: "3.7"
        # Python37Mac:
        #   imageName: "macos-10.14"
@@ -60,7 +60,7 @@
        #   imageName: "ubuntu-18.04"
        #   python.version: "3.8"
        # Python38Windows:
-       #   imageName: "vs2017-win2016"
+       #   imageName: "windows-2019"
        #   python.version: "3.8"
        Python38Mac:
          imageName: "macos-10.14"
@@ -68,12 +68,21 @@
        Python39Linux:
          imageName: "ubuntu-18.04"
          python.version: "3.9"
-       Python39Windows:
-         imageName: "vs2017-win2016"
-         python.version: "3.9"
-       Python39Mac:
-         imageName: "macos-10.14"
-         python.version: "3.9"
+       # Python39Windows:
+       #   imageName: "windows-2019"
+       #   python.version: "3.9"
+       # Python39Mac:
+       #   imageName: "macos-10.14"
+       #   python.version: "3.9"
+       Python310Linux:
+         imageName: "ubuntu-20.04"
+         python.version: "3.10"
+       Python310Windows:
+         imageName: "windows-2019"
+         python.version: "3.10"
+       Python310Mac:
+         imageName: "macos-10.15"
+         python.version: "3.10"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
diff --git a/build-constraints.txt b/build-constraints.txt
index 23e660096..cf5fe3284 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -2,4 +2,5 @@
 numpy==1.15.0; python_version<='3.7'
 numpy==1.17.3; python_version=='3.8'
 numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
diff --git a/pyproject.toml b/pyproject.toml
index cb103de0a..f81484d43 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.11,<8.1.0",
+    "thinc>=8.0.12,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
diff --git a/requirements.txt b/requirements.txt
index 9bc39e323..36cf5c58e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 spacy-legacy>=3.0.8,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.11,<8.1.0
+thinc>=8.0.12,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index b2b7e6be3..e5b03afe6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,6 +21,7 @@ classifiers =
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -37,14 +38,14 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.8,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
@@ -95,6 +96,10 @@ cuda111 =
     cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
     cupy-cuda112>=5.0.0b4,<10.0.0
+cuda113 =
+    cupy-cuda113>=5.0.0b4,<10.0.0
+cuda114 =
+    cupy-cuda114>=5.0.0b4,<10.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
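
Note (illustrative, not part of the patch): each `cudaXXX` extra simply pins the matching `cupy-cudaXXX` wheel, so after e.g. `pip install spacy[cuda113]` on a CUDA 11.3 machine, GPU use is opt-in from Python. A minimal check, assuming one of the cupy extras is installed:

```python
import spacy

# prefer_gpu() activates the GPU if cupy can see one and returns False
# (falling back to CPU) otherwise; require_gpu() would raise instead.
# Call it before loading or creating any pipeline.
if spacy.prefer_gpu():
    print("Running on GPU via cupy")
else:
    print("No usable GPU, staying on CPU")
```
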
From 76173b0866d4f5ebd8fbcf941a1af606de1bc16f Mon Sep 17 00:00:00 2001
From: Philip Vollet
Date: Fri, 29 Oct 2021 06:57:44 +0200
Subject: [PATCH 3/6] fixed typo and URL (#9560)

---
 website/meta/universe.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index df8077419..80608c77d 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1138,7 +1138,7 @@
     {
       "id": "deplacy",
       "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis",
-      "discreption": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
+      "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
       "github": "KoichiYasuoka/deplacy",
       "image": "https://i.imgur.com/6uOI4Op.png",
       "code_example": [
@@ -1270,7 +1270,7 @@
       "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
       "github": "chartbeat-labs/textacy",
       "pip": "textacy",
-      "url": "https://chartbeat-labs.github.io/textacy/",
+      "url": "https://github.com/chartbeat-labs/textacy",
       "author": "Burton DeWilde",
       "author_links": {
         "github": "bdewilde",
From 5477453ea374a88c7ef9bffb9dd0496035a9baa8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 29 Oct 2021 10:35:31 +0200
Subject: [PATCH 4/6] Docs for thinc-apple-ops (#9549)

* Docs for thinc-apple-ops
* Ignore thinc-apple-ops in reqs tests
* Fix install quickstart
* Add cupy cuda 113, 114 extras
* Remove draft section

Co-authored-by: Ines Montani
---
 setup.cfg                                 |  2 ++
 spacy/tests/package/test_requirements.py  |  1 +
 website/docs/usage/index.md               | 15 ++++++++-------
 website/src/widgets/quickstart-install.js | 22 +++++++++++++++++++++-
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index e5b03afe6..dc31228e5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -100,6 +100,8 @@ cuda113 =
     cupy-cuda113>=5.0.0b4,<10.0.0
 cuda114 =
     cupy-cuda114>=5.0.0b4,<10.0.0
+apple =
+    thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 1d51bd609..75908df59 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -25,6 +25,7 @@ def test_build_dependencies():
         "sudachipy",
         "sudachidict_core",
         "spacy-pkuseg",
+        "thinc-apple-ops",
     ]
 
     # check requirements.txt
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index 707dd3215..54ab62467 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 > $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
 > ```
 
-| Name                   | Description                                                                                                                                                                                                                                                     |
-| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`              | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
-| `transformers`         | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
-| `ray`                  | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                   |
-| `cuda`, ...            | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                 |
-| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
+| Name             | Description                                                                                                                                                                                                                                                     |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
+| `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
+| `ray`            | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                   |
+| `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                 |
+| `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
+| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
 
 ### conda {#conda}
 
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 8ed602b72..628e1c533 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -4,10 +4,12 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo, DEFAULT_BRANCH } from '../components/util'
 
+const DEFAULT_OS = 'mac'
+const DEFAULT_PLATFORM = 'x86'
 const DEFAULT_MODELS = ['en']
 const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda102'
+const DEFAULT_CUDA = 'cuda113'
 const CUDA = {
     '8.0': 'cuda80',
     '9.0': 'cuda90',
@@ -19,11 +21,15 @@ const CUDA = {
     '11.0': 'cuda110',
     '11.1': 'cuda111',
     '11.2': 'cuda112',
+    '11.3': 'cuda113',
+    '11.4': 'cuda114',
 }
 const LANG_EXTRAS = ['ja'] // only for languages with models
 
 const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
+    const [platform, setPlatform] = useState(DEFAULT_PLATFORM)
+    const [os, setOs] = useState(DEFAULT_OS)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
     const [selectedModels, setModels] = useState(DEFAULT_MODELS)
@@ -33,15 +39,19 @@ const QuickstartInstall = ({ id, title }) => {
         config: v => setTrain(v.includes('train')),
         models: setModels,
         optimize: v => setEfficiency(v.includes('efficiency')),
+        platform: v => setPlatform(v[0]),
+        os: v => setOs(v[0]),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
     }
     const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const apple = os === 'mac' && platform === 'arm'
     const pipExtras = [
         hardware === 'gpu' && cuda,
         train && 'transformers',
         train && 'lookups',
+        apple && 'apple',
         ...modelExtras,
     ]
         .filter(e => e)
@@ -62,6 +72,16 @@ const QuickstartInstall = ({ id, title }) => {
                     { id: 'windows', title: 'Windows' },
                     { id: 'linux', title: 'Linux' },
                 ],
+                defaultValue: DEFAULT_OS,
             },
+            {
+                id: 'platform',
+                title: 'Platform',
+                options: [
+                    { id: 'x86', title: 'x86', checked: true },
+                    { id: 'arm', title: 'ARM / M1' },
+                ],
+                defaultValue: DEFAULT_PLATFORM,
+            },
             {
                 id: 'package',
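
Note (illustrative, not part of the patch): `thinc-apple-ops` provides an alternative thinc ops implementation that uses Apple's native libraries for faster matrix multiplication on M1. A rough way to inspect which ops implementation thinc is currently using; the exact class reported depends on the installed packages and configuration, so treat this purely as a sketch:

```python
from thinc.api import get_current_ops

# get_current_ops() returns the active thinc Ops instance, e.g. NumpyOps on
# plain CPU; with the new `apple` extra installed and selected, an
# AppleOps-based implementation may be reported instead.
ops = get_current_ops()
print(type(ops).__name__)
```
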
From 2fd8d616e77cd48a60007a4c64ca49d5833c1fee Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 29 Oct 2021 08:36:34 +0000
Subject: [PATCH 5/6] Add docs section for spacy.cli.train.train (#9545)

* Add section for spacy.cli.train.train
* Add link from training page to train function
* Ensure path in train helper
* Update docs

Co-authored-by: Ines Montani
---
 spacy/cli/train.py             |  8 +++++---
 website/docs/api/cli.md        | 23 +++++++++++++++++++++++
 website/docs/api/top-level.md  | 10 +++++-----
 website/docs/usage/training.md | 30 +++++++++++++++++++++++++++---
 4 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 664fc2aaf..cc22cbba6 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Union
 from pathlib import Path
 from wasabi import msg
 import typer
@@ -46,12 +46,14 @@ def train_cli(
 
 
 def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
     *,
     use_gpu: int = -1,
     overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 268ea0703..a4462af56 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -819,6 +819,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | overrides   | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES** | The final trained pipeline and the best trained pipeline.                                                                                                                                  |
 
+### Calling the training function from Python {#train-function new="3.2"}
+
+The training CLI exposes a `train` helper function that lets you run the
+training just like `spacy train`. Usually it's easier to use the command line
+directly, but if you need to kick off training from code this is how to do it.
+
+> #### Example
+>
+> ```python
+> from spacy.cli.train import train
+>
+> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+>
+> ```
+
+| Name           | Description                                                                                                                    |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`  | Path to the config to use for training. ~~Union[str, Path]~~                                                                   |
+| `output_path`  | Optional name of directory to save output model in. If not provided a model will not be saved. ~~Optional[Union[str, Path]]~~  |
+| _keyword-only_ |                                                                                                                                |
+| `use_gpu`      | Which GPU to use. Defaults to -1 for no GPU. ~~int~~                                                                           |
+| `overrides`    | Values to override config settings. ~~Dict[str, Any]~~                                                                         |
+
 ## pretrain {#pretrain new="2.1" tag="command,experimental"}
 
 Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index f6910bd5b..c78a1de03 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -826,17 +826,17 @@ from the specified model. Intended for use in `[initialize.before_init]`.
 > after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
 > ```
 
-Recursively wrap the models in each pipe using [NVTX](https://nvidia.github.io/NVTX/)
-range markers. These markers aid in GPU profiling by attributing specific operations
-to a ~~Model~~'s forward or backprop passes.
+Recursively wrap the models in each pipe using
+[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
+profiling by attributing specific operations to a ~~Model~~'s forward or
+backprop passes.
 
 | Name | Description |
-|------------------|------------------------------------------------------------------------------------------------------------------------------|
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
 | `forward_color`  | Color identifier for forward passes. Defaults to `-1`. ~~int~~ |
 | `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~ |
 | **CREATES** | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~ |
-
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 94fdad209..bd5ea7751 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -301,8 +301,6 @@ fly without having to save to and load from disk.
 $ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
 ```
 
-
-
 ### Using variable interpolation {#config-interpolation}
 
 Another very useful feature of the config system is that it supports variable
@@ -1647,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.
 
 ## Internal training API {#api}
 
-
+
 spaCy gives you full control over the training loop. However, for most use
 cases, it's recommended to train your pipelines via the
@@ -1659,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with
 
 
 
+### Training from a Python script {#api-train new="3.2"}
+
+If you want to run the training from a Python script instead of using the
+[`spacy train`](/api/cli#train) CLI command, you can call into the
+[`train`](/api/cli#train-function) helper function directly. It takes the path
+to the config file, an optional output directory and an optional dictionary of
+[config overrides](#config-overrides).
+
+```python
+from spacy.cli.train import train
+
+train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+```
+
+### Internal training loop API {#api-loop}
+
+
+
+This section documents how the training loop and updates to the `nlp` object
+work internally. You typically shouldn't have to implement this in Python unless
+you're writing your own trainable components. To train a pipeline, use
+[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
+function instead.
+
+
+
 The [`Example`](/api/example) object contains annotated training data, also
 called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
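
Note (a usage sketch building on the docs above; all paths are placeholders, not from the patch): once `train()` returns, the pipelines are saved under the output directory in the same layout the CLI produces, so the best model can be loaded directly:

```python
import spacy
from spacy.cli.train import train

# Train with explicit data paths and an output directory; "./output" and
# the .spacy paths below are placeholder values for this sketch.
train(
    "./config.cfg",
    "./output",
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
)

# spacy train writes model-best and model-last into the output directory.
nlp = spacy.load("./output/model-best")
```
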
From 006df1ae1fe2d745dec19cacb3ca71d06447a7aa Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 29 Oct 2021 10:08:40 +0000
Subject: [PATCH 6/6] Clarify error when words are of wrong type (#9541)

* Clarify error when words are of wrong type

See #9437

* Update docs

* Use try/except

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem

Co-authored-by: Adriane Boyd

Co-authored-by: Sofie Van Landeghem
Co-authored-by: Adriane Boyd
---
 spacy/errors.py         |  1 +
 spacy/tokens/doc.pyx    | 16 ++++++++++------
 website/docs/api/doc.md |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index e6912a263..ff1185361 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -877,6 +877,7 @@ class Errors:
              "filename. Specify an epoch to resume from.")
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
+    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5ea3e1e3b..1ee845934 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -194,11 +194,12 @@ cdef class Doc:
 
         vocab (Vocab): A vocabulary object, which must match any models you
             want to use (e.g. tokenizer, parser, entity recognizer).
-        words (Optional[List[str]]): A list of unicode strings to add to the document
-            as words. If `None`, defaults to empty list.
-        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
-            words. True means that the word is followed by a space, False means
-            it is not. If `None`, defaults to `[True]*len(words)`
+        words (Optional[List[Union[str, int]]]): A list of unicode strings or
+            hash values to add to the document as words. If `None`, defaults to
+            empty list.
+        spaces (Optional[List[bool]]): A list of boolean values, of the same
+            length as `words`. `True` means that the word is followed by a space,
+            `False` means it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
         tags (Optional[List[str]]): A list of unicode strings, of the same
             length as words, to assign as token.tag. Defaults to None.
@@ -266,7 +267,10 @@ cdef class Doc:
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
-                lexeme = self.vocab.get_by_orth(self.mem, word)
+                try:
+                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                except TypeError:
+                    raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)
 
         if heads is not None:
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index e1f18963b..9836b8c21 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | Name           | Description                                                                                                                                                                                  |
 | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                             |
-| `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                           |
+| `words`        | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                 |
 | `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
 | _keyword-only_ |                                                                                                                                                                                              |
 | `user_data`    | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                           |
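
Note (a behavior sketch for the change above; the values are illustrative): strings and integer hash values are both valid `words`, while anything else now surfaces the new E1022 message as a `TypeError` instead of an opaque internal error:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
# Strings (and integer hash values) are accepted as words.
doc = Doc(vocab, words=["hello", "world"], spaces=[True, False])
print([t.text for t in doc])  # ['hello', 'world']

try:
    Doc(vocab, words=[3.14])  # a float is neither str nor int
except TypeError as err:
    print(err)  # E1022: Words must be of type str or int ...
```
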