diff --git a/website/README.md b/website/README.md index 10a75161b..825d13c65 100644 --- a/website/README.md +++ b/website/README.md @@ -609,7 +609,6 @@ In addition to the native markdown elements, you can use the components ├── docs # the actual markdown content ├── meta # JSON-formatted site metadata | ├── languages.json # supported languages and statistical models -| ├── logos.json # logos and links for landing page | ├── sidebars.json # sidebar navigations for different sections | ├── site.json # general site metadata | └── universe.json # data for the spaCy universe section diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index fc72eda98..53c8c46cf 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -38,7 +38,7 @@ how the component should be configured. You can override its settings via the | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py +%%GITHUB_SPACY/spacy/pipeline/attributeruler.py ``` ## AttributeRuler.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 47af9be96..55e552e72 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -229,13 +229,13 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] ### Converters {#converters} -| ID | Description | -| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `json` | JSON-formatted training data used in spaCy v2.x. | -| `conll` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| ID | Description | +| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x. | +| `conll` | Universal Dependencies `.conllu` or `.conll` format. | +| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. 
The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). | ## debug {#debug new="3"} @@ -358,37 +358,37 @@ File /path/to/spacy/ml/models/tok2vec.py (line 207) Registry @loggers Name spacy.ConsoleLogger.v1 Module spacy.training.loggers -File /path/to/spacy/gold/loggers.py (line 8) +File /path/to/spacy/training/loggers.py (line 8) ℹ [training.batcher] Registry @batchers Name spacy.batch_by_words.v1 Module spacy.training.batchers -File /path/to/spacy/gold/batchers.py (line 49) +File /path/to/spacy/training/batchers.py (line 49) ℹ [training.batcher.size] Registry @schedules Name compounding.v1 Module thinc.schedules -File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 43) +File /path/to/thinc/thinc/schedules.py (line 43) ℹ [training.dev_corpus] Registry @readers Name spacy.Corpus.v1 Module spacy.training.corpus -File /path/to/spacy/gold/corpus.py (line 18) +File /path/to/spacy/training/corpus.py (line 18) ℹ [training.optimizer] Registry @optimizers Name Adam.v1 Module thinc.optimizers -File /Users/ines/Repos/explosion/thinc/thinc/optimizers.py (line 58) +File /path/to/thinc/thinc/optimizers.py (line 58) ℹ [training.optimizer.learn_rate] Registry @schedules Name warmup_linear.v1 Module thinc.schedules -File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 91) +File /path/to/thinc/thinc/schedules.py (line 91) ℹ [training.train_corpus] Registry @readers Name spacy.Corpus.v1 Module spacy.training.corpus -File /path/to/spacy/gold/corpus.py (line 18) +File /path/to/spacy/training/corpus.py (line 18) ``` diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index b913d9a05..f6f6bbf68 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -2,7 +2,7 @@ title: Corpus teaser: An annotated corpus tag: class -source: spacy/gold/corpus.py +source: spacy/training/corpus.py new: 3 --- @@ -42,7 +42,7 @@ streaming. | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py +%%GITHUB_SPACY/spacy/training/corpus.py ``` ## Corpus.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/cython.md b/website/docs/api/cython.md index d7c03cf41..16b11cead 100644 --- a/website/docs/api/cython.md +++ b/website/docs/api/cython.md @@ -23,12 +23,12 @@ abruptly. With Cython there are four ways of declaring complex data types. Unfortunately we use all four in different places, as they all have different utility: -| Declaration | Description | Example | -| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | -| `class` | A normal Python class. | [`Language`](/api/language) | -| `cdef class` | A Python extension type. 
Differs from a normal Python class in that its attributes can be defined on the underlying struct. Can have C-level objects as attributes (notably structs and pointers), and can have methods which have C-level objects as arguments or return types. | [`Lexeme`](/api/cython-classes#lexeme) | -| `cdef struct` | A struct is just a collection of variables, sort of like a named tuple, except the memory is contiguous. Structs can't have methods, only attributes. | [`LexemeC`](/api/cython-structs#lexemec) | -| `cdef cppclass` | A C++ class. Like a struct, this can be allocated on the stack, but can have methods, a constructor and a destructor. Differs from `cdef class` in that it can be created and destroyed without acquiring the Python global interpreter lock. This style is the most obscure. | [`StateC`](https://github.com/explosion/spaCy/tree/master/spacy/syntax/_state.pxd) | +| Declaration | Description | Example | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | +| `class` | A normal Python class. | [`Language`](/api/language) | +| `cdef class` | A Python extension type. Differs from a normal Python class in that its attributes can be defined on the underlying struct. Can have C-level objects as attributes (notably structs and pointers), and can have methods which have C-level objects as arguments or return types. | [`Lexeme`](/api/cython-classes#lexeme) | +| `cdef struct` | A struct is just a collection of variables, sort of like a named tuple, except the memory is contiguous. Structs can't have methods, only attributes. | [`LexemeC`](/api/cython-structs#lexemec) | +| `cdef cppclass` | A C++ class. Like a struct, this can be allocated on the stack, but can have methods, a constructor and a destructor. Differs from `cdef class` in that it can be created and destroyed without acquiring the Python global interpreter lock. This style is the most obscure. | [`StateC`](%%GITHUB_SPACY/spacy/pipeline/_parser_internals/_state.pxd) | The most important classes in spaCy are defined as `cdef class` objects. The underlying data for these objects is usually gathered into a struct, which is diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 6a3b528c6..3d78df39d 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -37,7 +37,7 @@ recommended settings for your use case, check out the > guide on [registered functions](/usage/training#config-functions) for details. ```ini -https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg +%%GITHUB_SPACY/spacy/default_config.cfg ``` @@ -45,8 +45,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg Under the hood, spaCy's configs are powered by our machine learning library [Thinc's config system](https://thinc.ai/docs/usage-config), which uses [`pydantic`](https://github.com/samuelcolvin/pydantic/) for data validation -based on type hints. See -[`spacy/schemas.py`](https://github.com/explosion/spaCy/blob/develop/spacy/schemas.py) +based on type hints. See [`spacy/schemas.py`](%%GITHUB_SPACY/spacy/schemas.py) for the schemas used to validate the default config. 
Arguments of registered functions are validated against their type annotations, if available. To debug your config and check that it's valid, you can run the @@ -456,7 +455,7 @@ lexical data. Here's an example of the 20 most frequent lexemes in the English training data: ```json -https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl +%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl ``` ## Pipeline meta {#meta} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 5bd2ea8ad..674812567 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -57,7 +57,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx +%%GITHUB_SPACY/spacy/pipeline/dep_parser.pyx ``` ## DependencyParser.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 8cde6c490..a9d45d68e 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -50,7 +50,7 @@ architectures and their arguments and hyperparameters. | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py +%%GITHUB_SPACY/spacy/pipeline/entity_linker.py ``` ## EntityLinker.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 9189fe763..1420aa1a7 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -48,7 +48,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx +%%GITHUB_SPACY/spacy/pipeline/ner.pyx ``` ## EntityRecognizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 454b2a04b..a6934eeef 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -42,7 +42,7 @@ how the component should be configured. You can override its settings via the | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. 
~~str~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py +%%GITHUB_SPACY/spacy/pipeline/entityruler.py ``` ## EntityRuler.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 132e9e8f5..668c8028f 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -2,7 +2,7 @@ title: Example teaser: A training instance tag: class -source: spacy/gold/example.pyx +source: spacy/training/example.pyx new: 3.0 --- diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 530f7740d..c24023177 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -944,11 +944,11 @@ available to the loaded object. ## Class attributes {#class-attributes} -| Name | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | -| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | -| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). ~~Config~~ | +| Name | Description | +| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | +| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | +| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](%%GITHUB_SPACY/spacy/default_config.cfg). ~~Config~~ | ## Defaults {#defaults} @@ -981,34 +981,17 @@ customize the default language data: > config = Config().from_str(DEFAULT_CONFIG) > ``` -| Name | Description | -| --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`][stop_words.py] ~~Set[str]~~ | -| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] ~~Dict[str, List[dict]]~~ | -| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`puncutation.py`][punctuation.py] ~~Optional[List[Union[str, Pattern]]]~~ | -| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] ~~Optional[Pattern]~~ | -| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] ~~Optional[Pattern]~~ | -| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`][lex_attrs.py] ~~Dict[int, Callable[[str], Any]]~~ | -| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`][syntax_iterators.py]. ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ | -| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}.`.
**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Dict[str, Any]~~ | -| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.
**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Config~~ | - -[stop_words.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py -[tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/tokenizer_exceptions.py -[de/tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py -[fr/tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/tokenizer_exceptions.py -[punctuation.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py -[lex_attrs.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py -[syntax_iterators.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py -[zh/__init__.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/zh/__init__.py +| Name | Description | +| --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) ~~Set[str]~~ | +| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) ~~Dict[str, List[dict]]~~ | +| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) ~~Optional[List[Union[str, Pattern]]]~~ | +| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/fr/tokenizer_exceptions.py) ~~Optional[Pattern]~~ | +| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/tokenizer_exceptions.py) ~~Optional[Pattern]~~ | +| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) ~~Dict[int, Callable[[str], Any]]~~ | +| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py) ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ | +| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}`.
**Example:** [`zh/__init__.py`](%%GITHUB_SPACY/spacy/lang/zh/__init__.py) ~~Dict[str, Any]~~ | +| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.
**Example:** [`zh/__init__.py`](%%GITHUB_SPACY/spacy/lang/zh/__init__.py) ~~Config~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 45a8736db..486410907 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -56,7 +56,7 @@ data formats used by the lookup and rule-based lemmatizers, see | `model` | **Not yet implemented:** the model to use. ~~Model~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py +%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py ``` ## Lemmatizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index c4787c050..f2b2f9cc0 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -37,7 +37,7 @@ architectures and their arguments and hyperparameters. | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx +%%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx ``` ## Morphologizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index ebbf9ccc4..c8d61a5a9 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -22,7 +22,7 @@ for how to use the `Pipe` base class to implement custom components. > inherit from `Pipe`. ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/pipe.pyx +%%GITHUB_SPACY/spacy/pipeline/pipe.pyx ``` ## Pipe.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 3d9f61e8d..ca19327bb 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -34,7 +34,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx +%%GITHUB_SPACY/spacy/pipeline/senter.pyx ``` ## SentenceRecognizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 8104b1151..c435acdcb 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -33,7 +33,7 @@ how the component should be configured. You can override its settings via the | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx +%%GITHUB_SPACY/spacy/pipeline/sentencizer.pyx ``` ## Sentencizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 06def58d5..d83a77357 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -34,7 +34,7 @@ architectures and their arguments and hyperparameters. | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). 
~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx +%%GITHUB_SPACY/spacy/pipeline/tagger.pyx ``` ## Tagger.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index b296c95ca..cc20d6fd2 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -41,7 +41,7 @@ architectures and their arguments and hyperparameters. | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py +%%GITHUB_SPACY/spacy/pipeline/textcat.py ``` ## TextCategorizer.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index deb8369ab..6f13a17a5 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters. | `model` | The model to use. Defaults to [HashEmbedCNN](/api/architectures#HashEmbedCNN). ~~Model[List[Doc], List[Floats2d]~~ | ```python -https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py +%%GITHUB_SPACY/spacy/pipeline/tok2vec.py ``` ## Tok2Vec.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 7f66abb5f..38e2299fa 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -105,8 +105,7 @@ your installation, installed pipelines and local setup from within spaCy. ### spacy.explain {#spacy.explain tag="function"} Get a description for a given POS tag, dependency label or entity type. For a -list of available terms, see -[`glossary.py`](https://github.com/explosion/spaCy/tree/master/spacy/glossary.py). +list of available terms, see [`glossary.py`](%%GITHUB_SPACY/spacy/glossary.py). > #### Example > @@ -262,11 +261,11 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| Name | Description | +| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. 
~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -348,7 +347,7 @@ See the [`Transformer`](/api/transformer) API reference and | [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. | | [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | -## Loggers {#loggers source="spacy/gold/loggers.py" new="3"} +## Loggers {#loggers source="spacy/training/loggers.py" new="3"} A logger records the training results. When a logger is created, two functions are returned: one for logging the information for each training step, and a @@ -452,7 +451,7 @@ remain in the config file stored on your local system. | `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ | | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ | -## Batchers {#batchers source="spacy/gold/batchers.py" new="3"} +## Batchers {#batchers source="spacy/training/batchers.py" new="3"} A data batcher implements a batching strategy that essentially turns a stream of items into a stream of batches, with each batch consisting of one item or a list @@ -536,7 +535,7 @@ sequences in the batch. | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | -## Training data and alignment {#gold source="spacy/gold"} +## Training data and alignment {#gold source="spacy/training"} ### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} @@ -616,12 +615,12 @@ token-based tags, e.g. to overwrite the `doc.ents`. ## Utility functions {#util source="spacy/util.py"} spaCy comes with a small collection of utility functions located in -[`spacy/util.py`](https://github.com/explosion/spaCy/tree/master/spacy/util.py). -Because utility functions are mostly intended for **internal use within spaCy**, -their behavior may change with future releases. The functions documented on this -page should be safe to use and we'll try to ensure backwards compatibility. -However, we recommend having additional tests in place if your application -depends on any of spaCy's utilities. +[`spacy/util.py`](%%GITHUB_SPACY/spacy/util.py). Because utility functions are +mostly intended for **internal use within spaCy**, their behavior may change +with future releases. The functions documented on this page should be safe to +use and we'll try to ensure backwards compatibility. However, we recommend +having additional tests in place if your application depends on any of spaCy's +utilities. 
### util.get_lang_class {#util.get_lang_class tag="function"} @@ -832,10 +831,10 @@ Compile a sequence of prefix rules into a regex object. > nlp.tokenizer.prefix_search = prefix_regex.search > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} @@ -849,10 +848,10 @@ Compile a sequence of suffix rules into a regex object. > nlp.tokenizer.suffix_search = suffix_regex.search > ``` -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_infix_regex {#util.compile_infix_regex tag="function"} @@ -866,10 +865,10 @@ Compile a sequence of infix rules into a regex object. > nlp.tokenizer.infix_finditer = infix_regex.finditer > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | -| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). 
~~Pattern~~ | ### util.minibatch {#util.minibatch tag="function" new="2"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index fc8a8deef..d5bcef229 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -31,7 +31,7 @@ supports all models that are available via the Usually you will connect subsequent components to the shared transformer using the [TransformerListener](/api/architectures#TransformerListener) layer. This works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and -[Tok2VecListener](/api/architectures/Tok2VecListener) sublayer. +[Tok2VecListener](/api/architectures/#Tok2VecListener) sublayer. The component assigns the output of the transformer to the `Doc`'s extension attributes. We also calculate an alignment between the word-piece tokens and the diff --git a/website/docs/images/prodigy_overview.jpg b/website/docs/images/prodigy_overview.jpg new file mode 100644 index 000000000..84326ccea Binary files /dev/null and b/website/docs/images/prodigy_overview.jpg differ diff --git a/website/docs/images/projects.png b/website/docs/images/projects.png new file mode 100644 index 000000000..934e98e0a Binary files /dev/null and b/website/docs/images/projects.png differ diff --git a/website/docs/images/wandb1.jpg b/website/docs/images/wandb1.jpg new file mode 100644 index 000000000..3baf4aba0 Binary files /dev/null and b/website/docs/images/wandb1.jpg differ diff --git a/website/docs/images/wandb2.jpg b/website/docs/images/wandb2.jpg new file mode 100644 index 000000000..cd67c9aa4 Binary files /dev/null and b/website/docs/images/wandb2.jpg differ diff --git a/website/docs/usage/101/_language-data.md b/website/docs/usage/101/_language-data.md index f1fa1f3a2..239cec9d1 100644 --- a/website/docs/usage/101/_language-data.md +++ b/website/docs/usage/101/_language-data.md @@ -2,9 +2,8 @@ Every language is different – and usually full of **exceptions and special cases**, especially amongst the most common words. Some of these exceptions are shared across languages, while others are **entirely specific** – usually so specific that they need to be hard-coded. The -[`lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) module -contains all language-specific data, organized in simple Python files. This -makes the data easy to update and extend. +[`lang`](%%GITHUB_SPACY/spacy/lang) module contains all language-specific data, +organized in simple Python files. This makes the data easy to update and extend. The **shared language data** in the directory root includes rules that can be generalized across languages – for example, rules for basic punctuation, emoji, @@ -22,28 +21,12 @@ values are defined in the [`Language.Defaults`](/api/language#defaults). > nlp_de = German() # Includes German data > ``` -| Name | Description | -| ----------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Stop words**
[`stop_words.py`][stop_words.py] | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | -| **Tokenizer exceptions**
[`tokenizer_exceptions.py`][tokenizer_exceptions.py] | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | -| **Punctuation rules**
[`punctuation.py`][punctuation.py] | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | -| **Character classes**
[`char_classes.py`][char_classes.py] | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | -| **Lexical attributes**
[`lex_attrs.py`][lex_attrs.py] | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | -| **Syntax iterators**
[`syntax_iterators.py`][syntax_iterators.py] | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | -| **Lemmatizer**
[`lemmatizer.py`][lemmatizer.py] [`spacy-lookups-data`][spacy-lookups-data] | Custom lemmatizer implementation and lemmatization tables. | - -[stop_words.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py -[tokenizer_exceptions.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py -[punctuation.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py -[char_classes.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py -[lex_attrs.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py -[syntax_iterators.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py -[lemmatizer.py]: - https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/lemmatizer.py -[spacy-lookups-data]: https://github.com/explosion/spacy-lookups-data +| Name | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Stop words**
[`stop_words.py`](%%GITHUB_SPACY/spacy/lang/en/stop_words.py) | List of most common words of a language that are often useful to filter out, for example "and" or "I". Matching tokens will return `True` for `is_stop`. | +| **Tokenizer exceptions**
[`tokenizer_exceptions.py`](%%GITHUB_SPACY/spacy/lang/de/tokenizer_exceptions.py) | Special-case rules for the tokenizer, for example, contractions like "can't" and abbreviations with punctuation, like "U.K.". | +| **Punctuation rules**
[`punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) | Regular expressions for splitting tokens, e.g. on punctuation or special characters like emoji. Includes rules for prefixes, suffixes and infixes. | +| **Character classes**
[`char_classes.py`](%%GITHUB_SPACY/spacy/lang/char_classes.py) | Character classes to be used in regular expressions, for example, Latin characters, quotes, hyphens or icons. | +| **Lexical attributes**
[`lex_attrs.py`](%%GITHUB_SPACY/spacy/lang/en/lex_attrs.py) | Custom functions for setting lexical attributes on tokens, e.g. `like_num`, which includes language-specific words like "ten" or "hundred". | +| **Syntax iterators**
[`syntax_iterators.py`](%%GITHUB_SPACY/spacy/lang/en/syntax_iterators.py) | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks). | +| **Lemmatizer**
[`lemmatizer.py`](%%GITHUB_SPACY/master/spacy/lang/fr/lemmatizer.py) [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) | Custom lemmatizer implementation and lemmatization tables. | diff --git a/website/docs/usage/_benchmarks-choi.md b/website/docs/usage/_benchmarks-choi.md deleted file mode 100644 index 47d6f479f..000000000 --- a/website/docs/usage/_benchmarks-choi.md +++ /dev/null @@ -1,10 +0,0 @@ -import { Help } from 'components/typography' - -| System | Year | Language | Accuracy | Speed (wps) | -| -------------- | ---- | --------------- | -------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| **spaCy v2.x** | 2017 | Python / Cython | **92.6** | _n/a_ This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware. | -| **spaCy v1.x** | 2015 | Python / Cython | 91.8 | 13,963 | -| ClearNLP | 2015 | Java | 91.7 | 10,271 | -| CoreNLP | 2015 | Java | 89.6 | 8,602 | -| MATE | 2015 | Java | 92.5 | 550 | -| Turbo | 2015 | C++ | 92.4 | 349 | diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md new file mode 100644 index 000000000..0c04dd8d5 --- /dev/null +++ b/website/docs/usage/_benchmarks-models.md @@ -0,0 +1,44 @@ +import { Help } from 'components/typography'; import Link from 'components/link' + + + +
+ +| System | Parser | Tagger | NER | WPS
CPU words per second on CPU, higher is better | WPS
GPU words per second on GPU, higher is better | +| ------------------------------------------------------------------------- | ----------------: | ----------------: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | +| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | +| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | +| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | | +| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | _n/a_2 | _n/a_2 | 88.8 | 234 | 2k | +| Flair | - | 97.9 | 89.3 | | | + +
+ +**Accuracy and speed on the +[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**
**1. ** +[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_: +Qi et al. don't report parsing and tagging results on OntoNotes. We're working +on training Stanza on this corpus to allow direct comparison. + +
+ +
+ +
+ +| System | POS | UAS | LAS | +| -------------------------------------------------------------------------------- | ---: | ---: | ---: | +| spaCy RoBERTa (2020) | | | | +| spaCy CNN (2020) | | | | +| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | +| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | +
+ +**Accuracy on the Penn Treebank.** See +[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more +results. + +
+ +
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 5215c0ae5..8dd104ead 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -579,12 +579,17 @@ def MyCustomVectors( ## Pretraining {#pretraining} + + + + > #### Raw text format > diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index e2549ecfc..82db59a2e 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -5,254 +5,55 @@ next: /usage/spacy-101 menu: - ['Feature Comparison', 'comparison'] - ['Benchmarks', 'benchmarks'] + # TODO: - ['Citing spaCy', 'citation'] --- -## Feature comparison {#comparison} +## Comparison {#comparison hidden="true"} -Here's a quick comparison of the functionalities offered by spaCy, -[NLTK](http://www.nltk.org/py-modindex.html) and -[CoreNLP](http://stanfordnlp.github.io/CoreNLP/). +### When should I use spaCy? {#comparison-usage} -| | spaCy | NLTK | CoreNLP | -| ----------------------- | :----: | :----: | :-----------: | -| Programming language | Python | Python | Java / Python | -| Neural network models | ✅ | ❌ | ✅ | -| Integrated word vectors | ✅ | ❌ | ❌ | -| Multi-language support | ✅ | ✅ | ✅ | -| Tokenization | ✅ | ✅ | ✅ | -| Part-of-speech tagging | ✅ | ✅ | ✅ | -| Sentence segmentation | ✅ | ✅ | ✅ | -| Dependency parsing | ✅ | ❌ | ✅ | -| Entity recognition | ✅ | ✅ | ✅ | -| Entity linking | ✅ | ❌ | ❌ | -| Coreference resolution | ❌ | ❌ | ✅ | + -### When should I use what? {#comparison-usage} - -Natural Language Understanding is an active area of research and development, so -there are many different tools or technologies catering to different use-cases. -The table below summarizes a few libraries (spaCy, -[NLTK](http://www.nltk.org/py-modindex.html), [AllenNLP](https://allennlp.org/), -[StanfordNLP](https://stanfordnlp.github.io/stanfordnlp/) and -[TensorFlow](https://www.tensorflow.org/)) to help you get a feel for things fit -together. - -| | spaCy | NLTK | Allen-
NLP | Stanford-
NLP | Tensor-
Flow | -| ----------------------------------------------------------------- | :---: | :--: | :-------------: | :----------------: | :---------------: | -| I'm a beginner and just getting started with NLP. | ✅ | ✅ | ❌ | ✅ | ❌ | -| I want to build an end-to-end production application. | ✅ | ❌ | ❌ | ❌ | ✅ | -| I want to try out different neural network architectures for NLP. | ❌ | ❌ | ✅ | ❌ | ✅ | -| I want to try the latest models with state-of-the-art accuracy. | ❌ | ❌ | ✅ | ✅ | ✅ | -| I want to train models from my own data. | ✅ | ✅ | ✅ | ✅ | ✅ | -| I want my application to be efficient on CPU. | ✅ | ✅ | ❌ | ❌ | ❌ | +| Use Cases | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ✅ **I'm a beginner and just getting started with NLP.**
spaCy makes it easy to get started and comes with extensive documentation, including a beginner-friendly [101 guide](/usage/spacy-101) and a free interactive [online course](https://course.spacy.io). | +| ✅ **I want to build an end-to-end production application.** | +| ✅ **I want my application to be efficient on CPU.**
While spaCy lets you train modern NLP models that are best run on GPU, it also offers CPU-optimized pipelines, which may be less accurate but much cheaper to run. | +| ✅ **I want to try out different neural network architectures for NLP.** | +| ❌ **I want to build a language generation application.**
spaCy's focus is natural language _processing_ and extracting information from large volumes of text. While you can use it to help you re-write existing text, it doesn't include any specific functionality for language generation tasks. | +| ❌ **I want to research machine learning algorithms.** | ## Benchmarks {#benchmarks} -Two peer-reviewed papers in 2015 confirmed that spaCy offers the **fastest -syntactic parser in the world** and that **its accuracy is within 1% of the -best** available. The few systems that are more accurate are 20× slower or more. +spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy +right up to **current state-of-the-art**. You can also use a CPU-optimized +pipeline, which is less accurate but much cheaper to run. -> #### About the evaluation + + +> #### Evaluation details > -> The first of the evaluations was published by **Yahoo! Labs** and **Emory -> University**, as part of a survey of current parsing technologies -> ([Choi et al., 2015](https://aclweb.org/anthology/P/P15/P15-1038.pdf)). Their -> results and subsequent discussions helped us develop a novel -> psychologically-motivated technique to improve spaCy's accuracy, which we -> published in joint work with Macquarie University -> ([Honnibal and Johnson, 2015](https://www.aclweb.org/anthology/D/D15/D15-1162.pdf)). +> - **OntoNotes 5.0:** spaCy's English models are trained on this corpus, as +> it's several times larger than other English treebanks. However, most +> systems do not report accuracies on it. +> - **Penn Treebank:** The "classic" parsing evaluation for research. However, +> it's quite far removed from actual usage: it uses sentences with +> gold-standard segmentation and tokenization, from a pretty specific type of +> text (articles from a single newspaper, 1984-1989). -import BenchmarksChoi from 'usage/\_benchmarks-choi.md' +import Benchmarks from 'usage/\_benchmarks-models.md' - + -### Algorithm comparison {#algorithm} + -In this section, we compare spaCy's algorithms to recently published systems, -using some of the most popular benchmarks. These benchmarks are designed to help -isolate the contributions of specific algorithmic decisions, so they promote -slightly "idealized" conditions. Specifically, the text comes pre-processed with -"gold standard" token and sentence boundaries. The data sets also tend to be -fairly small, to help researchers iterate quickly. These conditions mean the -models trained on these data sets are not always useful for practical purposes. + -#### Parse accuracy (Penn Treebank / Wall Street Journal) {#parse-accuracy-penn} +The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone +our project template. -This is the "classic" evaluation, so it's the number parsing researchers are -most easily able to put in context. However, it's quite far removed from actual -usage: it uses sentences with gold-standard segmentation and tokenization, from -a pretty specific type of text (articles from a single newspaper, 1984-1989). + -> #### Methodology -> -> [Andor et al. (2016)](http://arxiv.org/abs/1603.06042) chose slightly -> different experimental conditions from -> [Choi et al. (2015)](https://aclweb.org/anthology/P/P15/P15-1038.pdf), so the -> two accuracy tables here do not present directly comparable figures. 
+ diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index ee5fd0a3b..170e16591 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -166,10 +166,9 @@ $ python setup.py build_ext --inplace # compile spaCy ``` Compared to regular install via pip, the -[`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt) -additionally installs developer dependencies such as Cython. See the -[quickstart widget](#quickstart) to get the right commands for your platform and -Python version. +[`requirements.txt`](%%GITHUB_SPACY/requirements.txt) additionally installs +developer dependencies such as Cython. See the [quickstart widget](#quickstart) +to get the right commands for your platform and Python version. #### Ubuntu {#source-ubuntu} @@ -195,16 +194,14 @@ that matches the version that was used to compile your Python interpreter. ### Run tests {#run-tests} -spaCy comes with an -[extensive test suite](https://github.com/explosion/spaCy/tree/master/spacy/tests). -In order to run the tests, you'll usually want to clone the -[repository](https://github.com/explosion/spaCy/tree/master/) and -[build spaCy from source](#source). This will also install the required +spaCy comes with an [extensive test suite](%%GITHUB_SPACY/spacy/tests). In order +to run the tests, you'll usually want to clone the [repository](%%GITHUB_SPACY) +and [build spaCy from source](#source). This will also install the required development dependencies and test utilities defined in the `requirements.txt`. Alternatively, you can find out where spaCy is installed and run `pytest` on that directory. Don't forget to also install the test utilities via spaCy's -[`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt): +[`requirements.txt`](%%GITHUB_SPACY/requirements.txt): ```bash $ python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))" diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 6783f2b7f..aefc64ece 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -28,9 +28,9 @@ A **model architecture** is a function that wires up a neural network that is run internally as part of a component in a spaCy pipeline. To define the actual architecture, you can implement your logic in Thinc directly, or you can use Thinc as a thin wrapper around frameworks such as -PyTorch, TensorFlow and MXNet. Each Model can also be used as a sublayer of a +PyTorch, TensorFlow and MXNet. Each `Model` can also be used as a sublayer of a larger network, allowing you to freely combine implementations from different -frameworks into one `Thinc` Model. +frameworks into a single model. spaCy's built-in components require a `Model` instance to be passed to them via the config system. To change the model architecture of an existing component, @@ -253,7 +253,7 @@ torch_model = nn.Sequential( nn.ReLU(), nn.Dropout2d(dropout), nn.Softmax(dim=1) - ) +) ``` The resulting wrapped `Model` can be used as a **custom architecture** as such, @@ -264,9 +264,10 @@ larger network. 
This effectively means that you can easily wrap different components from different frameworks, and "glue" them together with Thinc: ```python -from thinc.api import chain, with_array +from thinc.api import chain, with_array, PyTorchWrapper from spacy.ml import CharacterEmbed +wrapped_pt_model = PyTorchWrapper(torch_model) char_embed = CharacterEmbed(width, embed_size, nM, nC) model = chain(char_embed, with_array(wrapped_pt_model)) ``` @@ -473,18 +474,17 @@ with Model.define_operators({">>": chain}): ## Create new trainable components {#components} - -![Diagram of a pipeline component with its model](../images/layers-architectures.svg) + diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 3cf6316c9..a229c18e9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -854,24 +854,22 @@ The algorithm can be summarized as follows: **Global** and **language-specific** tokenizer data is supplied via the language -data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). The -tokenizer exceptions define special cases like "don't" in English, which needs -to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`. -The prefixes, suffixes and infixes mostly define punctuation rules – for -example, when to split off periods (at the end of a sentence), and when to leave -tokens containing periods intact (abbreviations like "U.S."). +data in [`spacy/lang`](%%GITHUB_SPACY/spacy/lang). The tokenizer exceptions +define special cases like "don't" in English, which needs to be split into two +tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`. The prefixes, suffixes +and infixes mostly define punctuation rules – for example, when to split off +periods (at the end of a sentence), and when to leave tokens containing periods +intact (abbreviations like "U.S."). Tokenization rules that are specific to one language, but can be **generalized across that language** should ideally live in the language data in -[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) – we -always appreciate pull requests! Anything that's specific to a domain or text -type – like financial trading abbreviations, or Bavarian youth slang – should be -added as a special case rule to your tokenizer instance. If you're dealing with -a lot of customizations, it might make sense to create an entirely custom -subclass. +[`spacy/lang`](%%GITHUB_SPACY/spacy/lang) – we always appreciate pull requests! +Anything that's specific to a domain or text type – like financial trading +abbreviations, or Bavarian youth slang – should be added as a special case rule +to your tokenizer instance. If you're dealing with a lot of customizations, it +might make sense to create an entirely custom subclass. @@ -1059,7 +1057,7 @@ but also detailed regular expressions that take the surrounding context into account. For example, there is a regular expression that treats a hyphen between letters as an infix. 
If you do not want the tokenizer to split on hyphens between letters, you can modify the existing infix definition from -[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py): +[`lang/punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py): ```python ### {executable="true"} @@ -1096,10 +1094,10 @@ print([t.text for t in doc]) # ['mother-in-law'] ``` For an overview of the default regular expressions, see -[`lang/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/punctuation.py) -and language-specific definitions such as -[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py) -for German. +[`lang/punctuation.py`](%%GITHUB_SPACY/spacy/lang/punctuation.py) and +language-specific definitions such as +[`lang/de/punctuation.py`](%%GITHUB_SPACY/spacy/lang/de/punctuation.py) for +German. ### Hooking a custom tokenizer into the pipeline {#custom-tokenizer} diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 9b1e96e4e..e94cdfe9e 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -76,7 +76,7 @@ spaCy also supports pipelines trained on more than one language. This is especially useful for named entity recognition. The language ID used for multi-language or language-neutral pipelines is `xx`. The language class, a generic subclass containing only the base language data, can be found in -[`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx). +[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx). To train a pipeline using the neutral multi-language class, you can set `lang = "xx"` in your [training config](/usage/training#config). You can also diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 137c697b8..81ddf40fb 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -728,18 +728,21 @@ workflows, but only one can be tracked by DVC.
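To make the DVC integration concrete, here is a minimal sketch of the typical commands, assuming DVC is installed and your `project.yml` defines a workflow named `all` (both names are illustrative):

```bash
# Auto-generate a dvc.yaml for the project's "all" workflow
$ python -m spacy project dvc . all
# Re-run the workflow, skipping steps whose inputs haven't changed
$ dvc repro
```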
- + --- ### Prodigy {#prodigy} + + +The Prodigy integration will require a nightly version of Prodigy that supports +spaCy v3+. + + + [Prodigy](https://prodi.gy) is a modern annotation tool for creating training data for machine learning models, developed by us. It integrates with spaCy out-of-the-box and provides many different @@ -795,9 +798,7 @@ results. -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum -sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat -mattis pretium. + @@ -805,10 +806,6 @@ mattis pretium. ### Streamlit {#streamlit} - - -
- [Streamlit](https://streamlit.io) is a Python framework for building interactive data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit) package helps you integrate spaCy visualizations into your Streamlit apps and @@ -817,16 +814,14 @@ full embedded visualizer, as well as individual components. -```bash -$ pip install "spacy_streamlit>=1.0.0a0" -``` - -
+> #### Installation +> +> ```bash +> $ pip install "spacy_streamlit>=1.0.0a0" +> ``` ![](../images/spacy-streamlit.png) -
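As a quick illustration, a minimal Streamlit script using the package could look like the sketch below; the pipeline name and example text are placeholders, and the script assumes `en_core_web_sm` is installed:

```python
### streamlit_app.py
import spacy_streamlit

models = ["en_core_web_sm"]
default_text = "Sundar Pichai is the CEO of Google."
spacy_streamlit.visualize(models, default_text)
```

You would then run it with `streamlit run streamlit_app.py` to get the full embedded visualizer in your browser.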
- Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your projects can easily define their own scripts that spin up an interactive visualizer, using the latest pipeline you trained, or a selection of pipelines @@ -917,10 +912,43 @@ https://github.com/explosion/projects/blob/v3/integrations/fastapi/scripts/main. ### Ray {#ray} + + + --- ### Weights & Biases {#wandb} - +[Weights & Biases](https://www.wandb.com/) is a popular platform for experiment +tracking. spaCy integrates with it out-of-the-box via the +[`WandbLogger`](/api/top-level#WandbLogger), which you can add as the +`[training.logger]` block of your training [config](/usage/training#config). The +results of each step are then logged in your project, together with the full +**training config**. This means that _every_ hyperparameter, registered function +name and argument will be tracked and you'll be able to see the impact it has on +your results. + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.WandbLogger.v1" +> project_name = "monitor_spacy_training" +> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"] +> ``` + +![Screenshot: Visualized training results](../images/wandb1.jpg) + +![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values') + + + +Get started with tracking your spaCy training runs in Weights & Biases using our +project template. It includes a simple config using the `WandbLogger`, as well +as a custom logger implementation you can adjust for your specific use case. + + + + diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 01d60ddb8..2d6159f3d 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -192,12 +192,11 @@ of [`Token`](/api/token). This means that all of the attributes that refer to computed properties can't be accessed. The uppercase attribute names like `LOWER` or `IS_PUNCT` refer to symbols from -the -[`spacy.attrs`](https://github.com/explosion/spaCy/tree/master/spacy/attrs.pyx) -enum table. They're passed into a function that essentially is a big case/switch -statement, to figure out which struct field to return. The same attribute -identifiers are used in [`Doc.to_array`](/api/doc#to_array), and a few other -places in the code where you need to describe fields like this. +the [`spacy.attrs`](%%GITHUB_SPACY/spacy/attrs.pyx) enum table. They're passed +into a function that essentially is a big case/switch statement, to figure out +which struct field to return. The same attribute identifiers are used in +[`Doc.to_array`](/api/doc#to_array), and a few other places in the code where +you need to describe fields like this. diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 9955e7d84..c0fe1323c 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -187,11 +187,11 @@ add to that data and saves and loads the data to and from a JSON file. > > To see custom serialization methods in action, check out the new > [`EntityRuler`](/api/entityruler) component and its -> [source](https://github.com/explosion/spaCy/tree/master/spacy/pipeline/entityruler.py). -> Patterns added to the component will be saved to a `.jsonl` file if the -> pipeline is serialized to disk, and to a bytestring if the pipeline is -> serialized to bytes. 
This allows saving out a pipeline with a rule-based -> entity recognizer and including all rules _with_ the component data. +> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the +> component will be saved to a `.jsonl` file if the pipeline is serialized to +> disk, and to a bytestring if the pipeline is serialized to bytes. This allows +> saving out a pipeline with a rule-based entity recognizer and including all +> rules _with_ the component data. ```python ### {highlight="14-18,20-25"} diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 82fec4b6a..8a98f8a1b 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -494,7 +494,7 @@ regressions to the parts of the library that you care about the most. **For more details on the types of contributions we're looking for, the code conventions and other useful tips, make sure to check out the -[contributing guidelines](https://github.com/explosion/spaCy/tree/master/CONTRIBUTING.md).** +[contributing guidelines](%%GITHUB_SPACY/CONTRIBUTING.md).** diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 51e56f2d5..4b25d1c21 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -59,7 +59,7 @@ specific use case. It's also available in spaCy as the import QuickstartTraining from 'widgets/quickstart-training.js' - + After you've saved the starter config to a file `base_config.cfg`, you can use the [`init fill-config`](/api/cli#init-fill-config) command to fill in the @@ -127,7 +127,7 @@ Some of the main advantages and features of spaCy's training config are: config which types of data to expect. ```ini -https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg +%%GITHUB_SPACY/spacy/default_config.cfg ``` Under the hood, the config is parsed into a dictionary. It's divided into diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index f7bcc17d3..aee3c24a6 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -76,9 +76,7 @@ noise contrastive estimation or reinforcement learning. ## New features {#features} This section contains an overview of the most important **new features and -improvements**. The [API docs](/api) include additional deprecation notes. New -methods and functions that were introduced in this version are marked with the -tag 2. +improvements**. The [API docs](/api) include additional deprecation notes. ### Convolutional neural network models {#features-models} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 3cbccc352..791b641df 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -8,20 +8,30 @@ menu: - ['Migrating from v2.x', 'migrating'] --- -## Summary {#summary} +## Summary {#summary hidden="true"} - +
+spaCy v3.0 features all new **transformer-based pipelines** that bring spaCy's +accuracy right up to the current **state-of-the-art**. You can use any +pretrained transformer to train your own pipelines, and even share one +transformer between multiple components with **multi-task learning**. Training +is now fully configurable and extensible, and you can define your own custom +models using **PyTorch**, **TensorFlow** and other frameworks. The new spaCy +projects system lets you describe whole **end-to-end workflows** in a single +file, giving you an easy path from prototype to production, and making it easy +to clone and adapt best-practice projects for your own use cases. +
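As a minimal sketch of what this looks like in practice, assuming you have downloaded the `en_core_web_trf` package, a transformer-based pipeline loads and runs like any other trained pipeline:

```python
import spacy

# Load the transformer-based English pipeline and process a text
nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print([(ent.text, ent.label_) for ent in doc.ents])
```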
- [Summary](#summary) - [New features](#features) -- [Training & config system](#features-training) - [Transformer-based pipelines](#features-transformers) +- [Training & config system](#features-training) - [Custom models](#features-custom-models) - [End-to-end project workflows](#features-projects) - [New built-in components](#features-pipeline-components) @@ -39,47 +49,126 @@ menu: ## New Features {#features} -### New training workflow and config system {#features-training} - - - -- **Usage:** [Training pipelines and models](/usage/training) -- **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config), - [`Config`](https://thinc.ai/docs/api-config#config) -- **CLI:** [`train`](/api/cli#train), [`pretrain`](/api/cli#pretrain), - [`evaluate`](/api/cli#evaluate) -- **API:** [Config format](/api/data-formats#config), - [`registry`](/api/top-level#registry) - - +This section contains an overview of the most important **new features and +improvements**. The [API docs](/api) include additional deprecation notes. New +methods and functions that were introduced in this version are marked with the +tag 3. ### Transformer-based pipelines {#features-transformers} +> #### Example +> +> ```cli +> $ python -m spacy download en_core_web_trf +> ``` + +spaCy v3.0 features all new transformer-based pipelines that bring spaCy's +accuracy right up to the current **state-of-the-art**. You can use any +pretrained transformer to train your own pipelines, and even share one +transformer between multiple components with **multi-task learning**. spaCy's +transformer support interoperates with [PyTorch](https://pytorch.org) and the +[HuggingFace `transformers`](https://huggingface.co/transformers/) library, +giving you access to thousands of pretrained models for your pipelines. + ![Pipeline components listening to shared embedding component](../images/tok2vec-listener.svg) +import Benchmarks from 'usage/\_benchmarks-models.md' + + + - **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), - [Training pipelines and models](/usage/training) + [Training pipelines and models](/usage/training), + [Benchmarks](/usage/facts-figures#benchmarks) - **API:** [`Transformer`](/api/transformer), [`TransformerData`](/api/transformer#transformerdata), [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), [TransformerListener](/api/architectures#TransformerListener), [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) -- **Trained Pipelines:** [`en_core_trf_lg_sm`](/models/en) +- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) - **Implementation:** [`spacy-transformers`](https://github.com/explosion/spacy-transformers) +### New training workflow and config system {#features-training} + +> #### Example +> +> ```ini +> [training] +> vectors = null +> accumulate_gradient = 3 +> +> [training.optimizer] +> @optimizers = "Adam.v1" +> +> [training.optimizer.learn_rate] +> @schedules = "warmup_linear.v1" +> warmup_steps = 250 +> total_steps = 20000 +> initial_rate = 0.01 +> ``` + +spaCy v3.0 introduces a comprehensive and extensible system for **configuring +your training runs**. A single configuration file describes every detail of your +training run, with no hidden defaults, making it easy to rerun your experiments +and track changes. You can use the +[quickstart widget](/usage/training#quickstart) or the `init config` command to +get started. 
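For example, a starter config can be generated on the command line – the language and pipeline components below are just placeholders:

```cli
$ python -m spacy init config config.cfg --lang en --pipeline ner
```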
Instead of providing lots of arguments on the command line, you +only need to pass your `config.cfg` file to `spacy train`. + +Training config files include all **settings and hyperparameters** for training +your pipeline. Some settings can also be registered **functions** that you can +swap out and customize, making it easy to implement your own custom models and +architectures. + + + +- **Usage:** [Training pipelines and models](/usage/training) +- **Thinc:** [Thinc's config system](https://thinc.ai/docs/usage-config), + [`Config`](https://thinc.ai/docs/api-config#config) +- **CLI:** [`init config`](/api/cli#init-config), + [`init fill-config`](/api/cli#init-fill-config), [`train`](/api/cli#train), + [`pretrain`](/api/cli#pretrain), [`evaluate`](/api/cli#evaluate) +- **API:** [Config format](/api/data-formats#config), + [`registry`](/api/top-level#registry) + + + ### Custom models using any framework {#features-custom-models} +> #### Example +> +> ```python +> from torch import nn +> from thinc.api import PyTorchWrapper +> +> torch_model = nn.Sequential( +> nn.Linear(32, 32), +> nn.ReLU(), +> nn.Softmax(dim=1) +> ) +> model = PyTorchWrapper(torch_model) +> ``` + +spaCy's new configuration system makes it easy to customize the neural network +models used by the different pipeline components. You can also implement your +own architectures via spaCy's machine learning library [Thinc](https://thinc.ai) +that provides various layers and utilities, as well as thin wrappers around +frameworks like **PyTorch**, **TensorFlow** and **MXNet**. Component models all +follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each +`Model` can also be used as a sublayer of a larger network, allowing you to +freely combine implementations from different frameworks into a single model. + - **Usage: ** [Layers and architectures](/usage/layers-architectures) - **Thinc: ** - [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks) + [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks), + [`Model` API](https://thinc.ai/docs/api-model) - **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe) @@ -159,8 +248,7 @@ add to your pipeline and customize for your use case: - **Usage:** [Processing pipelines](/usage/processing-pipelines) - **API:** [Built-in pipeline components](/api#architecture-pipeline) -- **Implementation:** - [`spacy/pipeline`](https://github.com/explosion/spaCy/tree/develop/spacy/pipeline) +- **Implementation:** [`spacy/pipeline`](%%GITHUB_SPACY/spacy/pipeline) @@ -197,15 +285,12 @@ aren't set. [`@Language.factory`](/api/language#factory), [`Language.add_pipe`](/api/language#add_pipe), [`Language.analyze_pipes`](/api/language#analyze_pipes) -- **Implementation:** - [`spacy/language.py`](https://github.com/explosion/spaCy/tree/develop/spacy/language.py) +- **Implementation:** [`spacy/language.py`](%%GITHUB_SPACY/spacy/language.py)
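To illustrate the decorator-based component API, here is a minimal sketch of a stateless custom component that is registered under a name and then added to the pipeline by that name (the component name and blank pipeline are placeholders):

```python
import spacy
from spacy.language import Language

@Language.component("debug_doc_length")
def debug_doc_length(doc):
    # Print the number of tokens and pass the Doc through unchanged
    print(f"Doc length: {len(doc)}")
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("debug_doc_length", last=True)
doc = nlp("This is a sentence.")
```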
### Dependency matching {#features-dep-matcher} - - > #### Example > > ```python @@ -233,7 +318,7 @@ dictionaries**, with each dictionary describing a **token to match** and its [Dependency matching](/usage/rule-based-matching#dependencymatcher), - **API:** [`DependencyMatcher`](/api/dependencymatcher), - **Implementation:** - [`spacy/matcher/dependencymatcher.pyx`](https://github.com/explosion/spaCy/tree/develop/spacy/matcher/dependencymatcher.pyx) + [`spacy/matcher/dependencymatcher.pyx`](%%GITHUB_SPACY/spacy/matcher/dependencymatcher.pyx) @@ -404,11 +489,12 @@ Note that spaCy v3.0 now requires **Python 3.6+**. [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that returns a sequence of `Example` objects to initialize the model instead of a list of tuples. -- [`Matcher.add`](/api/matcher#add), - [`PhraseMatcher.add`](/api/phrasematcher#add) and - [`DependencyMatcher.add`](/api/dependencymatcher#add) now only accept a list - of patterns as the second argument (instead of a variable number of - arguments). The `on_match` callback becomes an optional keyword argument. +- [`Matcher.add`](/api/matcher#add) and + [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of + patterns as the second argument (instead of a variable number of arguments). + The `on_match` callback becomes an optional keyword argument. +- The `spacy.gold` module has been renamed to + [`spacy.training`](%%GITHUB_SPACY/spacy/training). - The `PRON_LEMMA` symbol and `-PRON-` as an indicator for pronoun lemmas has been removed. - The `TAG_MAP` and `MORPH_RULES` in the language data have been replaced by the @@ -779,6 +865,20 @@ python -m spacy package ./output ./packages - python setup.py sdist ``` +#### Data utilities and gold module {#migrating-gold} + +The `spacy.gold` module has been renamed to `spacy.training`. This mostly +affects internals, but if you've been using the span offset conversion utilities +[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), +[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or +[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to +change your imports: + +```diff +- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags ++ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags +``` + #### Migration notes for plugin maintainers {#migrating-plugins} Thanks to everyone who's been contributing to the spaCy ecosystem by developing diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 6c67de6ea..78fdc336f 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -8,7 +8,6 @@ const codeBlocksPlugin = require('./src/plugins/remark-code-blocks.js') // Import metadata const site = require('./meta/site.json') -const logos = require('./meta/logos.json') const sidebars = require('./meta/sidebars.json') const models = require('./meta/languages.json') const universe = require('./meta/universe.json') @@ -20,11 +19,16 @@ const favicon = isNightly ? `src/images/icon_nightly.png` : `src/images/icon.png const binderBranch = isNightly ? 'nightly' : site.binderBranch const siteUrl = isNightly ? site.siteUrlNightly : site.siteUrl const domain = isNightly ? site.domainNightly : site.domain +const branch = isNightly ? 'develop' : 'master' + +// Those variables are going to be replaced in the Markdown, e.g. 
%%GITHUB_SPACY +const replacements = { + GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, +} module.exports = { siteMetadata: { ...site, - ...logos, sidebars, ...models, universe, @@ -121,6 +125,13 @@ module.exports = { { resolve: `gatsby-remark-copy-linked-files`, }, + { + resolve: 'gatsby-remark-find-replace', + options: { + replacements, + prefix: '%%', + }, + }, ], }, }, diff --git a/website/meta/logos.json b/website/meta/logos.json deleted file mode 100644 index 783995026..000000000 --- a/website/meta/logos.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "logosUsers": [ - { "id": "airbnb", "url": "https://www.airbnb.com" }, - { "id": "uber", "url": "https://www.uber.com" }, - { "id": "quora", "url": "https://www.quora.com" }, - { "id": "retriever", "url": "https://www.retriever.no" }, - { "id": "stitchfix", "url": "https://www.stitchfix.com" }, - { "id": "chartbeat", "url": "https://chartbeat.com" }, - { "id": "allenai", "url": "https://allenai.org" } - ], - "logosPublications": [ - { - "id": "recode", - "url": "https://www.recode.net/2017/6/22/15855492/ai-artificial-intelligence-nonprofit-good-human-chatbots-machine-learning" - }, - { - "id": "wapo", - "url": "https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/" - }, - { - "id": "bbc", - "url": "http://www.bbc.co.uk/rd/blog/2017-08-irfs-weeknotes-number-250" - }, - { - "id": "microsoft", - "url": "https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/" - }, - { - "id": "venturebeat", - "url": "https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/" - }, - { - "id": "thoughtworks", - "url": "https://www.thoughtworks.com/radar/tools" - } - ] -} diff --git a/website/meta/site.json b/website/meta/site.json index d1162edf9..1955932b9 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -28,7 +28,7 @@ }, "binderUrl": "explosion/spacy-io-binder", "binderBranch": "live", - "binderVersion": "2.3.0", + "binderVersion": "3.0.0", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, @@ -47,20 +47,19 @@ "items": [ { "text": "Usage", "url": "/usage" }, { "text": "Models", "url": "/models" }, - { "text": "API", "url": "/api" }, - { "text": "Universe", "url": "/universe" } + { "text": "API Reference", "url": "/api" }, + { "text": "Online Course", "url": "https://course.spacy.io" } ] }, { - "label": "Support", + "label": "Community", "items": [ + { "text": "Universe", "url": "/universe" }, { "text": "Issue Tracker", "url": "https://github.com/explosion/spaCy/issues" }, { "text": "Stack Overflow", "url": "http://stackoverflow.com/questions/tagged/spacy" - }, - { "text": "Reddit User Group", "url": "https://www.reddit.com/r/spacynlp/" }, - { "text": "Gitter Chat", "url": "https://gitter.im/explosion/spaCy" } + } ] }, { diff --git a/website/package-lock.json b/website/package-lock.json index d995f910e..d8444c2b2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -14238,6 +14238,46 @@ } } }, + "gatsby-remark-find-replace": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/gatsby-remark-find-replace/-/gatsby-remark-find-replace-0.3.0.tgz", + "integrity": "sha512-tTXt+ZxD+7hEVtZVbZVrifcQUk2mt4uJNUHhc9cje+93sDa4PrrFBbny9IWgXLj9QH9xDxWOZrI768ApMtbPUQ==", + "requires": { + "escape-string-regexp": "^2.0.0", + "unist-util-visit": 
"^2.0.1" + }, + "dependencies": { + "escape-string-regexp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz", + "integrity": "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==" + }, + "unist-util-is": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-4.0.2.tgz", + "integrity": "sha512-Ofx8uf6haexJwI1gxWMGg6I/dLnF2yE+KibhD3/diOqY2TinLcqHXCV6OI5gFVn3xQqDH+u0M625pfKwIwgBKQ==" + }, + "unist-util-visit": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-2.0.3.tgz", + "integrity": "sha512-iJ4/RczbJMkD0712mGktuGpm/U4By4FfDonL7N/9tATGIF4imikjOuagyMY53tnZq3NP6BcmlrHhEKAfGWjh7Q==", + "requires": { + "@types/unist": "^2.0.0", + "unist-util-is": "^4.0.0", + "unist-util-visit-parents": "^3.0.0" + } + }, + "unist-util-visit-parents": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-3.1.0.tgz", + "integrity": "sha512-0g4wbluTF93npyPrp/ymd3tCDTMnP0yo2akFD2FIBAYXq/Sga3lwaU1D8OYKbtpioaI6CkDcQ6fsMnmtzt7htw==", + "requires": { + "@types/unist": "^2.0.0", + "unist-util-is": "^4.0.0" + } + } + } + }, "gatsby-remark-images": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/gatsby-remark-images/-/gatsby-remark-images-3.0.4.tgz", @@ -22152,6 +22192,14 @@ "clipboard": "^2.0.0" } }, + "prismjs-bibtex": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/prismjs-bibtex/-/prismjs-bibtex-1.1.0.tgz", + "integrity": "sha512-IjZUJP3iTkV1DZ8qcjUF7p7Ji/LPns56jw+INUBPtnBaX4Q/VhtzlRGHM0lxSvdfqUvqgTGF3oM8aogWqzZz2g==", + "requires": { + "prismjs": "^1.15" + } + }, "private": { "version": "0.1.8", "resolved": "https://registry.npmjs.org/private/-/private-0.1.8.tgz", diff --git a/website/package.json b/website/package.json index 40018f532..def94a9c2 100644 --- a/website/package.json +++ b/website/package.json @@ -31,6 +31,7 @@ "gatsby-plugin-sitemap": "^2.0.5", "gatsby-plugin-svgr": "^2.0.1", "gatsby-remark-copy-linked-files": "^2.0.9", + "gatsby-remark-find-replace": "^0.3.0", "gatsby-remark-images": "^3.0.4", "gatsby-remark-prismjs": "^3.2.4", "gatsby-remark-smartypants": "^2.0.8", @@ -44,6 +45,7 @@ "node-sass": "^4.11.0", "parse-numeric-range": "0.0.2", "prismjs": "^1.15.0", + "prismjs-bibtex": "^1.1.0", "prop-types": "^15.7.2", "react": "^16.8.2", "react-dom": "^16.8.2", diff --git a/website/src/components/code.js b/website/src/components/code.js index f075539ea..5a7828a33 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -2,6 +2,7 @@ import React, { Fragment } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' import highlightCode from 'gatsby-remark-prismjs/highlight-code.js' +import 'prismjs-bibtex' import rangeParser from 'parse-numeric-range' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' diff --git a/website/src/components/grid.js b/website/src/components/grid.js index 1d11a748f..299fcf931 100644 --- a/website/src/components/grid.js +++ b/website/src/components/grid.js @@ -9,6 +9,7 @@ export default function Grid({ narrow = false, gutterBottom = true, className, + style, children, }) { const gridClassNames = classNames(classes.root, className, { @@ -18,7 +19,11 @@ export default function Grid({ [classes.third]: cols === 3, [classes.quarter]: cols === 4, }) - return
<div className={gridClassNames}>{children}</div>
+    return (
+        <div className={gridClassNames} style={style}>
+            {children}
+        </div>
+ ) } Grid.propTypes = { diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js index 363638bf2..968b6cea8 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -23,7 +23,7 @@ export default function Infobox({