diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 53fd99ee8..a0ffa8f52 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -59,7 +59,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
@@ -79,7 +80,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = false
@@ -183,7 +185,8 @@ factory = "parser"
 
 [components.parser.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = true
@@ -200,7 +203,8 @@ factory = "ner"
 
 [components.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 use_upper = true
diff --git a/spacy/errors.py b/spacy/errors.py
index 153f8da0c..47a134c1f 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E917 = ("Received invalid value {value} for 'state_type' in "
+            "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
     E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
             "values are an instance of spacy.vocab.Vocab or True to create one"
             " (default).")
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 5d091c590..2c40bb3ab 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -2,6 +2,7 @@ from typing import Optional, List
 from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
 from thinc.types import Floats2d
 
+from ...errors import Errors
 from ...compat import Literal
 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine
@@ -12,7 +13,8 @@ from ...tokens import Doc
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
-    nr_feature_tokens: Literal[3, 6, 8, 13],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
     use_upper: bool = True,
@@ -41,20 +43,12 @@
 
     tok2vec (Model[List[Doc], List[Floats2d]]): Subnetwork to map tokens into
         vector representations.
-    nr_feature_tokens (int): The number of tokens in the context to use to
-        construct the state vector. Valid choices are 3, 6, 8 and 13. The
-        8 and 13 feature sets are designed for the parser, while the 3 and 6
-        feature sets are designed for the NER. The recommended feature sets are
-        3 for NER, and 8 for the dependency parser.
-
-        TODO: This feature should be split into two, state_type: ["deps", "ner"]
-        and extra_state_features: [True, False]. This would map into:
-
-        (deps, False): 8
-        (deps, True): 13
-        (ner, False): 3
-        (ner, True): 6
-
+    state_type (str):
+        String value denoting the type of parser model: "parser" or "ner"
+    extra_state_tokens (bool): Whether or not to use additional tokens in the context
+        to construct the state vector. Defaults to `False`, which means 3 and 8
+        for the NER and parser respectively. When set to `True`, this would become 6
+        feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
         Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
@@ -69,8 +63,14 @@
         Usually inferred from data at the beginning of training, or loaded from
         disk.
     """
+    if state_type == "parser":
+        nr_feature_tokens = 13 if extra_state_tokens else 8
+    elif state_type == "ner":
+        nr_feature_tokens = 6 if extra_state_tokens else 3
+    else:
+        raise ValueError(Errors.E917.format(value=state_type))
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
+    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
     tok2vec.set_dim("nO", hidden_width)
     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index edd791e40..a49475c8e 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -15,7 +15,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 2fa5c6392..fc4f03473 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -13,7 +13,8 @@ from ..training import validate_examples
 default_model_config = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
 
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 5f25cbfe1..ec7544456 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
 parser_config_string = """
 [model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "parser"
+extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
 
@@ -95,7 +96,11 @@ def my_parser():
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
     parser = build_tb_parser_model(
-        tok2vec=tok2vec, nr_feature_tokens=8, hidden_width=65, maxout_pieces=5
+        tok2vec=tok2vec,
+        state_type="parser",
+        extra_state_tokens=True,
+        hidden_width=65,
+        maxout_pieces=5,
     )
     return parser
 
@@ -345,8 +350,8 @@ def test_config_auto_fill_extra_fields():
 def test_config_validate_literal():
     nlp = English()
     config = Config().from_str(parser_config_string)
-    config["model"]["nr_feature_tokens"] = 666
+    config["model"]["state_type"] = "nonsense"
     with pytest.raises(ConfigValidationError):
         nlp.add_pipe("parser", config=config)
-    config["model"]["nr_feature_tokens"] = 13
+    config["model"]["state_type"] = "ner"
     nlp.add_pipe("parser", config=config)
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 8797b2f31..ef2666ec0 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -414,7 +414,8 @@ one component.
 > ```ini
 > [model]
 > @architectures = "spacy.TransitionBasedParser.v1"
-> nr_feature_tokens = 6
+> state_type = "ner"
+> extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
 >
@@ -446,15 +447,16 @@ consists of either two or three subnetworks:
    state representation. If not present, the output from the lower model is
    used as action scores directly.
 
-| Name                | Description |
-| ------------------- | ----------- |
-| `tok2vec`           | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `3`, `6`, `8` and `13`. The `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
-| `hidden_width`      | The width of the hidden layer. ~~int~~ |
-| `maxout_pieces`     | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
-| `use_upper`         | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
-| `nO`                | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
+| Name                 | Description |
+| -------------------- | ----------- |
+| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
+| `hidden_width`       | The width of the hidden layer. ~~int~~ |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
+| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
+| **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
 
 ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
 
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index a855d703c..d61172a5b 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -448,7 +448,8 @@ factory = "ner"
 
 [nlp.pipeline.ner.model]
 @architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
 use_upper = false
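For reference, below is a minimal sketch (not part of the patch) of how the two new settings map onto the old `nr_feature_tokens` values, mirroring the branch added to `build_tb_parser_model` in `spacy/ml/models/parser.py`. The helper name `infer_nr_feature_tokens` is hypothetical and only illustrates the migration.

```python
# Sketch only: reproduces the mapping introduced in build_tb_parser_model,
# where the number of state feature tokens is now derived from state_type
# and extra_state_tokens instead of being configured directly.
def infer_nr_feature_tokens(state_type: str, extra_state_tokens: bool) -> int:
    if state_type == "parser":
        return 13 if extra_state_tokens else 8
    elif state_type == "ner":
        return 6 if extra_state_tokens else 3
    raise ValueError(f"invalid state_type: {state_type!r}")


# Old configs migrate accordingly:
#   nr_feature_tokens = 8  -> state_type = "parser", extra_state_tokens = false
#   nr_feature_tokens = 13 -> state_type = "parser", extra_state_tokens = true
#   nr_feature_tokens = 3  -> state_type = "ner",    extra_state_tokens = false
#   nr_feature_tokens = 6  -> state_type = "ner",    extra_state_tokens = true
assert infer_nr_feature_tokens("parser", False) == 8
assert infer_nr_feature_tokens("ner", True) == 6
```

Because the architecture keeps the `spacy.TransitionBasedParser.v1` name, configs that still set `nr_feature_tokens` will no longer validate against the new signature and need to be updated along these lines.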