Add FeatureExtractor from Thinc (#6170)

* move featureextractor from Thinc

* Update website/docs/api/architectures.md

Co-authored-by: Ines Montani <ines@ines.io>

* Update website/docs/api/architectures.md

Co-authored-by: Ines Montani <ines@ines.io>

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
Sofie Van Landeghem 2020-10-01 16:22:48 +02:00 committed by GitHub
parent 73538782a0
commit a22215f427
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 54 additions and 9 deletions

View File

@ -0,0 +1,25 @@
from typing import List, Union, Callable, Tuple
from thinc.types import Ints2d, Doc
from thinc.api import Model, registry
@registry.layers("spacy.FeatureExtractor.v1")
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
    """Build the "extract_features" layer.

    columns: The attributes to extract, given as ints or strings. The list is
        stored unchanged in the layer's attrs under the "columns" key, where
        `forward` reads it back.
    RETURNS: A Model named "extract_features" that uses `forward` as its
        forward function.
    """
    layer_attrs = {"columns": columns}
    return Model("extract_features", forward, attrs=layer_attrs)
def forward(
    model: Model[List[Doc], List[Ints2d]], docs, is_train: bool
) -> Tuple[List[Ints2d], Callable]:
    """Extract the configured attribute columns from every input.

    model: The layer; its "columns" attr selects what to extract and its
        `ops` converts each result with `asarray2i`.
    docs: The inputs. An item that has a `to_array` method is queried
        directly; any other item is read through its `.doc`, sliced by its
        `.start` and `.end` (presumably a span-like object -- confirm against
        callers).
    is_train: Not used by this function.
    RETURNS: A list with one array per input, plus a backprop callback that
        ignores its argument and returns an empty list.
    """
    columns = model.attrs["columns"]
    features: List[Ints2d] = []
    for item in docs:
        if not hasattr(item, "to_array"):
            # No to_array on the item itself: take the rows of the parent
            # doc's array that fall inside the item's boundaries.
            values = item.doc.to_array(columns)[item.start : item.end]
        else:
            values = item.to_array(columns)
        # A 1d result is turned into a single-column 2d array.
        if values.ndim == 1:
            values = values.reshape((values.shape[0], 1))
        features.append(model.ops.asarray2i(values, dtype="uint64"))

    def backprop(d_features: List[Ints2d]) -> List:
        return []

    return features, backprop

View File

@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor
from thinc.api import Relu, residual, expand_window
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@registry.architectures.register("spacy.TextCatCNN.v1")

View File

@ -1,14 +1,14 @@
from typing import Optional, List
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list
from thinc.api import FeatureExtractor, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc
from ...util import registry
from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE

View File

@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
static vectors can also be incorporated into the concatenated representation.
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE` token attributes, which are extracted
with a [FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from
pretrained static vectors can also be incorporated into the concatenated representation.
| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {#FeatureExtractor}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.FeatureExtractor.v1"
> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
> ```
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.
| Name |  Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
The following architectures are provided by the package

View File

@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
embeddings.
```python
from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
from thinc.api import add, chain, remap_ids, Embed
from spacy.ml.staticvectors import StaticVectors
from spacy.ml.featureextractor import FeatureExtractor
from spacy.util import registry
@registry.architectures("my_example.MyEmbedding.v1")