mirror of https://github.com/explosion/spaCy.git
Update docs [ci skip]
This commit is contained in:
parent
c044460823
commit
d611cbef43
|
@ -8,9 +8,9 @@ api_string_name: tok2vec
|
|||
api_trainable: true
|
||||
---
|
||||
|
||||
Apply a "token-to-vector" model and set its outputs in the doc.tensor attribute.
|
||||
This is mostly useful to **share a single subnetwork** between multiple
|
||||
components, e.g. to have one embedding and CNN network shared between a
|
||||
Apply a "token-to-vector" model and set its outputs in the `Doc.tensor`
|
||||
attribute. This is mostly useful to **share a single subnetwork** between
|
||||
multiple components, e.g. to have one embedding and CNN network shared between a
|
||||
[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and
|
||||
[`EntityRecognizer`](/api/entityrecognizer).
|
||||
|
||||
|
|
|
@ -25,12 +25,12 @@ work out-of-the-box.
|
|||
|
||||
</Infobox>
|
||||
|
||||
This pipeline component lets you use transformer models in your pipeline, using
|
||||
the [HuggingFace `transformers`](https://huggingface.co/transformers) library
|
||||
under the hood. Usually you will connect subsequent components to the shared
|
||||
transformer using the
|
||||
[TransformerListener](/api/architectures#TransformerListener) layer. This works
|
||||
similarly to spaCy's [Tok2Vec](/api/tok2vec) component and
|
||||
This pipeline component lets you use transformer models in your pipeline.
|
||||
Supports all models that are available via the
|
||||
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
|
||||
Usually you will connect subsequent components to the shared transformer using
|
||||
the [TransformerListener](/api/architectures#TransformerListener) layer. This
|
||||
works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and
|
||||
[Tok2VecListener](/api/architectures#Tok2VecListener) sublayer.
|
||||
|
||||
The component assigns the output of the transformer to the `Doc`'s extension
|
||||
|
@ -419,7 +419,7 @@ Split a `TransformerData` object that represents a batch into a list with one
|
|||
| ----------- | ----------------------- | ----------- |
|
||||
| **RETURNS** | `List[TransformerData]` | |
|
||||
|
||||
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
||||
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
||||
|
||||
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
|
||||
return a list of [`Span`](/api/span) objects for each doc, to be processed by
|
||||
|
@ -427,15 +427,15 @@ the transformer. This is used to manage long documents, by cutting them into
|
|||
smaller sequences before running the transformer. The spans are allowed to
|
||||
overlap, and you can also omit sections of the Doc if they are not relevant.
|
||||
|
||||
Span getters can be referenced in the config's
|
||||
`[components.transformer.model.get_spans]` block to customize the sequences
|
||||
processed by the transformer. You can also register custom span getters using
|
||||
the `@registry.span_getters` decorator.
|
||||
Span getters can be referenced in the `[components.transformer.model.get_spans]`
|
||||
block of the config to customize the sequences processed by the transformer. You
|
||||
can also register custom span getters using the `@spacy.registry.span_getters`
|
||||
decorator.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> @registry.span_getters("sent_spans.v1")
|
||||
> @spacy.registry.span_getters("sent_spans.v1")
|
||||
> def configure_get_sent_spans() -> Callable:
|
||||
> def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
|
||||
> return [list(doc.sents) for doc in docs]
|
||||
|
@ -448,15 +448,55 @@ the `@registry.span_getters` decorator.
|
|||
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. |
|
||||
| **RETURNS** | `List[List[Span]]` | The spans to be processed by the transformer. |
|
||||
|
||||
The following built-in functions are available:
|
||||
### doc_spans.v1 {#doc_spans tag="registered function"}
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [transformer.model.get_spans]
|
||||
> @span_getters = "doc_spans.v1"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------ | ------------------------------------------------------------------ |
|
||||
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
|
||||
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
|
||||
| `strided_spans.v1` | |
|
||||
Create a span getter that uses the whole document as its spans. This is the best
|
||||
approach if your [`Doc`](/api/doc) objects already refer to relatively short
|
||||
texts.
|
||||
|
||||
### sent_spans.v1 {#sent_spans tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [transformer.model.get_spans]
|
||||
> @span_getters = "sent_spans.v1"
|
||||
> ```
|
||||
|
||||
Create a span getter that uses sentence boundary markers to extract the spans.
|
||||
This requires sentence boundaries to be set (e.g. by the
|
||||
[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches,
|
||||
depending on the sentence lengths. However, it does provide the transformer with
|
||||
more meaningful windows to attend over.
|
||||
|
||||
### strided_spans.v1 {#strided_spans tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [transformer.model.get_spans]
|
||||
> @span_getters = "strided_spans.v1"
|
||||
> window = 128
|
||||
> stride = 96
|
||||
> ```
|
||||
|
||||
Create a span getter for strided spans. If you set the `window` and `stride` to
|
||||
the same value, the spans will cover each token once. Setting `stride` lower
|
||||
than `window` will allow for an overlap, so that some tokens are counted twice.
|
||||
This can be desirable, because it allows all tokens to have both a left and
|
||||
right context.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | ---------------- |
|
||||
| `window` | int | The window size. |
|
||||
| `stride` | int | The stride size. |
|
||||
|
||||
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
|
||||
|
||||
|
|
|
@ -1,54 +1,88 @@
|
|||
The central data structures in spaCy are the `Doc` and the `Vocab`. The `Doc`
|
||||
object owns the **sequence of tokens** and all their annotations. The `Vocab`
|
||||
object owns a set of **look-up tables** that make common information available
|
||||
across documents. By centralizing strings, word vectors and lexical attributes,
|
||||
we avoid storing multiple copies of this data. This saves memory, and ensures
|
||||
there's a **single source of truth**.
|
||||
The central data structures in spaCy are the [`Language`](/api/language) class,
|
||||
the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class
|
||||
is used to process a text and turn it into a `Doc` object. It's typically stored
|
||||
as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and
|
||||
all their annotations. By centralizing strings, word vectors and lexical
|
||||
attributes in the `Vocab`, we avoid storing multiple copies of this data. This
|
||||
saves memory, and ensures there's a **single source of truth**.
|
||||
|
||||
Text annotations are also designed to allow a single source of truth: the `Doc`
|
||||
object owns the data, and `Span` and `Token` are **views that point into it**.
|
||||
The `Doc` object is constructed by the `Tokenizer`, and then **modified in
|
||||
place** by the components of the pipeline. The `Language` object coordinates
|
||||
these components. It takes raw text and sends it through the pipeline, returning
|
||||
an **annotated document**. It also orchestrates training and serialization.
|
||||
object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are
|
||||
**views that point into it**. The `Doc` object is constructed by the
|
||||
[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components
|
||||
of the pipeline. The `Language` object coordinates these components. It takes
|
||||
raw text and sends it through the pipeline, returning an **annotated document**.
|
||||
It also orchestrates training and serialization.
|
||||
|
||||
<!-- TODO: update architecture and tables below to match sidebar in API docs etc. -->
|
||||
<!-- TODO: update graphic -->
|
||||
|
||||
![Library architecture](../../images/architecture.svg)
|
||||
|
||||
### Container objects {#architecture-containers}
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
|
||||
| [`Span`](/api/span) | A slice from a `Doc` object. |
|
||||
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
|
||||
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
|
||||
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
|
||||
| Name | Description |
|
||||
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
|
||||
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
|
||||
| [`Span`](/api/span) | A slice from a `Doc` object. |
|
||||
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
|
||||
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
|
||||
| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
|
||||
| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
|
||||
|
||||
### Processing pipeline {#architecture-pipeline}
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. |
|
||||
| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. |
|
||||
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
|
||||
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
|
||||
| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. |
|
||||
| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. |
|
||||
| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. |
|
||||
| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. |
|
||||
| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
|
||||
| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
|
||||
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
|
||||
| [`Sentencizer`](/api/sentencizer) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. |
|
||||
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
|
||||
The processing pipeline consists of one or more **pipeline components** that are
|
||||
called on the `Doc` in order. The tokenizer runs before the components. Pipeline
|
||||
components can be added using [`Language.add_pipe`](/api/language#add_pipe).
|
||||
They can contain a statistical model and trained weights, or only make
|
||||
rule-based modifications to the `Doc`. spaCy provides a range of built-in
|
||||
components for different language processing tasks and also allows adding
|
||||
[custom components](/usage/processing-pipelines#custom-components).
|
||||
|
||||
![The processing pipeline](../../images/pipeline.svg)
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
|
||||
| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. |
|
||||
| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. |
|
||||
| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. |
|
||||
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
|
||||
| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. |
|
||||
| [`Tagger`](/api/tagger) | Predict part-of-speech tags. |
|
||||
| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. |
|
||||
| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. |
|
||||
| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. |
|
||||
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
|
||||
| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. |
|
||||
| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. |
|
||||
| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
|
||||
| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
|
||||
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
|
||||
| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. |
|
||||
|
||||
### Matchers {#architecture-matchers}
|
||||
|
||||
Matchers help you find and extract information from [`Doc`](/api/doc) objects
|
||||
based on match patterns describing the sequences you're looking for. A matcher
|
||||
operates on a `Doc` and gives you access to the matched tokens **in context**.
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
|
||||
| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
|
||||
| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
|
||||
|
||||
### Other classes {#architecture-other}
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------- | ----------------------------------------------------------------------------- |
|
||||
| [`Vocab`](/api/vocab) | A lookup table for the vocabulary that allows you to access `Lexeme` objects. |
|
||||
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
||||
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
||||
| [`Example`](/api/example) | Collection for training annotations. |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
|
||||
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
||||
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
||||
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
||||
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
|
||||
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
|
||||
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
|
||||
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
|
||||
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
||||
|
|
|
@ -11,7 +11,7 @@ import Link from './link'
|
|||
import GitHubCode from './github'
|
||||
import classes from '../styles/code.module.sass'
|
||||
|
||||
const WRAP_THRESHOLD = 15
|
||||
const WRAP_THRESHOLD = 16
|
||||
|
||||
export default props => (
|
||||
<Pre>
|
||||
|
|
Loading…
Reference in New Issue