diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 53fd99ee8..a0ffa8f52 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -59,7 +59,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
@@ -79,7 +80,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = false
@@ -183,7 +185,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
@@ -200,7 +203,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
diff --git a/spacy/errors.py b/spacy/errors.py
index 153f8da0c..47a134c1f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E917 = ("Received invalid value {value} for 'state_type' in "
+            "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
     E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
             "values are an instance of spacy.vocab.Vocab or True to create one"
             " (default).")
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 5d091c590..2c40bb3ab 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -2,6 +2,7 @@ from typing import Optional, List
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
+from ...errors import Errors
 from ...compat import Literal
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
@@ -12,7 +13,8 @@ from ...tokens import Doc
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    nr_feature_tokens: Literal[3, 6, 8, 13],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
     use_upper: bool = True,
@@ -41,20 +43,12 @@
 
     tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into
         vector representations.
-    nr_feature_tokens (int): The number of tokens in the context to use to
-        construct the state vector. Valid choices are 3, 6, 8 and 13. The
-        8 and 13 feature sets are designed for the parser, while the 3 and 6
-        feature sets are designed for the NER. The recommended feature sets are
-        3 for NER, and 8 for the dependency parser.
-
-        TODO: This feature should be split into two, state_type: ["deps", "ner"]
-        and extra_state_features: [True, False]. This would map into:
-
-        (deps, False): 8
-        (deps, True): 13
-        (ner, False): 3
-        (ner, True): 6
-
+    state_type (str):
+        String value denoting the type of parser model: "parser" or "ner"
+    extra_state_tokens (bool): Whether or not to use additional tokens in the context
+        to construct the state vector. Defaults to `False`, which means 3 and 8
+        for the NER and parser respectively. When set to `True`, this would become 6
+        feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
         Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
@@ -69,8 +63,14 @@
         Usually inferred from data at the beginning of training, or loaded from
         disk.
     """
+    if state_type == "parser":
+        nr_feature_tokens = 13 if extra_state_tokens else 8
+    elif state_type == "ner":
+        nr_feature_tokens = 6 if extra_state_tokens else 3
+    else:
+        raise ValueError(Errors.E917.format(value=state_type))
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
+    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
     tok2vec.set_dim("nO", hidden_width)
     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index edd791e40..a49475c8e 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -15,7 +15,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 2fa5c6392..fc4f03473 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -13,7 +13,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 5f25cbfe1..ec7544456 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
 parser_config_string = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
 
@@ -95,7 +96,11 @@ def my_parser():
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
-        tok2vec=tok2vec, nr_feature_tokens=8, hidden_width=65, maxout_pieces=5
+        tok2vec=tok2vec,
+        state_type="parser",
+        extra_state_tokens=True,
+        hidden_width=65,
+        maxout_pieces=5,
     )
     return parser
 
@@ -345,8 +350,8 @@ def test_config_auto_fill_extra_fields():
 def test_config_validate_literal():
     nlp = English()
     config = Config().from_str(parser_config_string)
-    config["model"]["nr_feature_tokens"] = 666
+    config["model"]["state_type"] = "nonsense"
     with pytest.raises(ConfigValidationError):
         nlp.add_pipe("parser", config=config)
-    config["model"]["nr_feature_tokens"] = 13
+    config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 8797b2f31..ef2666ec0 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -414,7 +414,8 @@ one component.
 > ```ini
 > [model]
 > @architectures = "spacy.TransitionBasedParser.v1"
-> nr_feature_tokens = 6
+> state_type = "ner"
+> extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
 >
@@ -446,15 +447,16 @@ consists of either two or three subnetworks:
    state representation. If not present, the output from the lower model is
    used as action scores directly.
 
-| Name                | Description |
-| ------------------- | ----------- |
-| `tok2vec`           | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `3`, `6`, `8` and `13`. The `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
-| `hidden_width`      | The width of the hidden layer. ~~int~~ |
-| `maxout_pieces`     | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
-| `use_upper`         | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
+| Name                 | Description |
+| -------------------- | ----------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~ |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
 
 ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index a855d703c..d61172a5b 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -448,7 +448,8 @@ factory = "ner"
 
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
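For reference, below is a minimal sketch (not part of the patch) of how the two new settings map onto the old `nr_feature_tokens` values, mirroring the branch added to `build_tb_parser_model` in `spacy/ml/models/parser.py`. The helper name `infer_nr_feature_tokens` is hypothetical and only illustrates the migration.

```python
# Sketch only: reproduces the mapping introduced in build_tb_parser_model,
# where the number of state feature tokens is now derived from state_type
# and extra_state_tokens instead of being configured directly.
def infer_nr_feature_tokens(state_type: str, extra_state_tokens: bool) -> int:
    if state_type == "parser":
        return 13 if extra_state_tokens else 8
    elif state_type == "ner":
        return 6 if extra_state_tokens else 3
    raise ValueError(f"invalid state_type: {state_type!r}")


# Old configs migrate accordingly:
#   nr_feature_tokens = 8  -> state_type = "parser", extra_state_tokens = false
#   nr_feature_tokens = 13 -> state_type = "parser", extra_state_tokens = true
#   nr_feature_tokens = 3  -> state_type = "ner",    extra_state_tokens = false
#   nr_feature_tokens = 6  -> state_type = "ner",    extra_state_tokens = true
assert infer_nr_feature_tokens("parser", False) == 8
assert infer_nr_feature_tokens("ner", True) == 6
```

Because the architecture keeps the `spacy.TransitionBasedParser.v1` name, configs that still set `nr_feature_tokens` will no longer validate against the new signature and need to be updated along these lines.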