Add FeatureExtractor from Thinc (#6170)

* move featureextractor from Thinc

* Update website/docs/api/architectures.md

Co-authored-by: Ines Montani <ines@ines.io>

* Update website/docs/api/architectures.md

Co-authored-by: Ines Montani <ines@ines.io>

Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
Sofie Van Landeghem 2020-10-01 16:22:48 +02:00 committed by GitHub
parent 73538782a0
commit a22215f427
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 54 additions and 9 deletions

View File

@ -0,0 +1,25 @@
from typing import List, Union, Callable, Tuple
from thinc.types import Ints2d, Doc
from thinc.api import Model, registry
@registry.layers("spacy.FeatureExtractor.v1")
def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
return Model("extract_features", forward, attrs={"columns": columns})
def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
columns = model.attrs["columns"]
features: List[Ints2d] = []
for doc in docs:
if hasattr(doc, "to_array"):
attrs = doc.to_array(columns)
else:
attrs = doc.doc.to_array(columns)[doc.start : doc.end]
if attrs.ndim == 1:
attrs = attrs.reshape((attrs.shape[0], 1))
features.append(model.ops.asarray2i(attrs, dtype="uint64"))
backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
return features, backprop

View File

@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor from thinc.api import Relu, residual, expand_window
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
@registry.architectures.register("spacy.TextCatCNN.v1") @registry.architectures.register("spacy.TextCatCNN.v1")

View File

@ -1,14 +1,14 @@
from typing import Optional, List from typing import Optional, List
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list
from thinc.api import FeatureExtractor, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d from thinc.types import Floats2d
from thinc.api import chain, clone, concatenate, with_array, with_padded
from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from ...tokens import Doc from ...tokens import Doc
from ...util import registry from ...util import registry
from ...ml import _character_embed from ...ml import _character_embed
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE

View File

@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used are a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained [FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
static vectors can also be incorporated into the concatenated representation. vectors can also be incorporated into the concatenated representation.
| Name | Description | | Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
| `key_attr` | Defaults to `"ORTH"`. ~~str~~ | | `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |
### spacy.FeatureExtractor.v1 {#FeatureExtractor}
> #### Example config
>
> ```ini
> [model]
> @architectures = "spacy.FeatureExtractor.v1"
> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
> ```
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.
| Name |  Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
The following architectures are provided by the package The following architectures are provided by the package

View File

@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
embeddings. embeddings.
```python ```python
from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor from thinc.api import add, chain, remap_ids, Embed
from spacy.ml.staticvectors import StaticVectors from spacy.ml.staticvectors import StaticVectors
from spacy.ml.featureextractor import FeatureExtractor
from spacy.util import registry from spacy.util import registry
@registry.architectures("my_example.MyEmbedding.v1") @registry.architectures("my_example.MyEmbedding.v1")