diff --git a/spacy/ml/featureextractor.py b/spacy/ml/featureextractor.py new file mode 100644 index 000000000..dcf212628 --- /dev/null +++ b/spacy/ml/featureextractor.py @@ -0,0 +1,25 @@ +from typing import List, Union, Callable, Tuple +from thinc.types import Ints2d +from thinc.api import Model, registry +from ..tokens import Doc + + +@registry.layers("spacy.FeatureExtractor.v1") +def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]: + return Model("extract_features", forward, attrs={"columns": columns}) + + +def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]: + columns = model.attrs["columns"] + features: List[Ints2d] = [] + for doc in docs: + if hasattr(doc, "to_array"): + attrs = doc.to_array(columns) + else: + attrs = doc.doc.to_array(columns)[doc.start : doc.end] + if attrs.ndim == 1: + attrs = attrs.reshape((attrs.shape[0], 1)) + features.append(model.ops.asarray2i(attrs, dtype="uint64")) + + backprop: Callable[[List[Ints2d]], List] = lambda d_features: [] + return features, backprop diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 16293cda4..1117b4fde 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import HashEmbed, with_array, with_cpu, uniqued -from thinc.api import Relu, residual, expand_window, FeatureExtractor +from thinc.api import Relu, residual, expand_window from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors +from ..featureextractor import FeatureExtractor @registry.architectures.register("spacy.TextCatCNN.v1") diff --git 
a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index fec478e21..95f9c66df 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,14 +1,14 @@ from typing import Optional, List -from thinc.api import chain, clone, concatenate, with_array, with_padded -from thinc.api import Model, noop, list2ragged, ragged2list -from thinc.api import FeatureExtractor, HashEmbed -from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from thinc.types import Floats2d +from thinc.api import chain, clone, concatenate, with_array, with_padded +from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed +from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from ...tokens import Doc from ...util import registry from ...ml import _character_embed from ..staticvectors import StaticVectors +from ..featureextractor import FeatureExtractor from ...pipeline.tok2vec import Tok2VecListener from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 3e6fbb283..5cee45ba5 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline. Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through a feed-forward subnetwork to build mixed representations. The features used are -the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions -depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained -static vectors can also be incorporated into the concatenated representation. +the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a +[FeatureExtractor](/api/architectures#FeatureExtractor) layer. 
Vectors from pretrained static +vectors can also be incorporated into the concatenated representation. | Name | Description | | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details. | `key_attr` | Defaults to `"ORTH"`. ~~str~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | +### spacy.FeatureExtractor.v1 {#FeatureExtractor} + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.FeatureExtractor.v1" +> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"] +> ``` + +Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list +of feature names to extract, which should refer to token attributes. + +| Name |  Description | +| ----------- | ------------------------------------------------------------------------ | +| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ | +| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ | + ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} The following architectures are provided by the package diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index d5c7ee93a..1b78b8dc5 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned embeddings. 
```python -from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor +from thinc.api import add, chain, remap_ids, Embed from spacy.ml.staticvectors import StaticVectors +from spacy.ml.featureextractor import FeatureExtractor from spacy.util import registry @registry.architectures("my_example.MyEmbedding.v1")