From 2b07ab7db49f2de42a711c4493a6c0939bc0a598 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 24 Feb 2017 18:26:29 +0100
Subject: [PATCH] Add feature scheme to API docs (see #857, #739)

---
 website/docs/api/_data.json      |   7 +-
 website/docs/api/features.jade   | 138 +++++++++++++++++++++++++++++++
 website/docs/usage/training.jade |   3 +
 3 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 website/docs/api/features.jade

diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json
index b41f97b38..ddb0cd72a 100644
--- a/website/docs/api/_data.json
+++ b/website/docs/api/_data.json
@@ -21,7 +21,8 @@
         "GoldParse": "goldparse"
     },
     "Other": {
-        "Annotation Specs": "annotation"
+        "Annotation Specs": "annotation",
+        "Feature Scheme": "features"
     }
 },

@@ -111,5 +112,9 @@
     "annotation": {
         "title": "Annotation Specifications"
+    },
+
+    "features": {
+        "title": "Linear Model Feature Scheme"
     }
 }
diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade
new file mode 100644
index 000000000..018790145
--- /dev/null
+++ b/website/docs/api/features.jade
@@ -0,0 +1,138 @@
+//- 💫 DOCS > API > LINEAR MODEL FEATURES
+
+include ../../_includes/_mixins
+
+p
+    | There are two popular strategies for putting together machine learning
+    | models for NLP: sparse linear models and neural networks. To solve NLP
+    | problems with linear models, you need to assemble feature templates
+    | that combine multiple atomic predictors. This page documents the atomic
+    | predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]],
+    | #[+api("tagger") #[code Tagger]] and
+    | #[+api("entityrecognizer") #[code EntityRecognizer]].
+
+p
+    | To understand the scheme, recall that spaCy's #[code Parser] and
+    | #[code EntityRecognizer] are implemented as push-down automata. They
+    | maintain a "stack" that holds the current entity, and a "buffer"
+    | consisting of the words to be processed.
+
+p
+    | Each state consists of the words on the stack (if any), which
+    | constitute the current entity being constructed. We also have the
+    | current word and the two subsequent words. Finally, we have the
+    | entities previously built.
+
+p
+    | This gives us a number of tokens to ask questions about when making
+    | the features. For each of these tokens, we can ask about a number of
+    | different properties. Each feature identifier asks about a specific
+    | property of a specific token of the context.
+
++h(2, "tokens") Context tokens
+
++table([ "ID", "Description" ])
+    +row
+        +cell #[code S0]
+        +cell
+            | The first word on the stack, i.e. the token most recently added
+            | to the current entity.
+
+    +row
+        +cell #[code S1]
+        +cell The second word on the stack, i.e. the second most recently added.
+
+    +row
+        +cell #[code S2]
+        +cell The third word on the stack, i.e. the third most recently added.
+
+    +row
+        +cell #[code N0]
+        +cell The first word of the buffer, i.e. the current word being tagged.
+
+    +row
+        +cell #[code N1]
+        +cell The second word of the buffer.
+
+    +row
+        +cell #[code N2]
+        +cell The third word of the buffer.
+
+    +row
+        +cell #[code P1]
+        +cell The word immediately before #[code N0].
+
+    +row
+        +cell #[code P2]
+        +cell The second word before #[code N0].
+
+    +row
+        +cell #[code E0]
+        +cell The first word of the previously constructed entity.
+
+    +row
+        +cell #[code E1]
+        +cell The first word of the second previously constructed entity.
+
+p For each of these tokens, we can ask about the following attributes:
+
++table([ "ID", "Attribute", "Description" ])
+    +row
+        +cell #[code N0w]
+        +cell #[code token.orth]
+        +cell The word form.
+
+    +row
+        +cell #[code N0W]
+        +cell #[code token.lemma]
+        +cell The word's lemma.
+
+    +row
+        +cell #[code N0p]
+        +cell #[code token.tag]
+        +cell The word's (full) POS tag.
+
+    +row
+        +cell #[code N0c]
+        +cell #[code token.cluster]
+        +cell The word's (full) Brown cluster.
+
+    +row
+        +cell #[code N0c4]
+        +cell -
+        +cell The first four digits of the word's Brown cluster.
+
+    +row
+        +cell #[code N0c6]
+        +cell -
+        +cell The first six digits of the word's Brown cluster.
+
+    +row
+        +cell #[code N0L]
+        +cell -
+        +cell The word's dependency label. Not used as a feature in the NER.
+
+    +row
+        +cell #[code N0_prefix]
+        +cell #[code token.prefix]
+        +cell The first three characters of the word.
+
+    +row
+        +cell #[code N0_suffix]
+        +cell #[code token.suffix]
+        +cell The last three characters of the word.
+
+    +row
+        +cell #[code N0_shape]
+        +cell #[code token.shape]
+        +cell The word's shape, i.e. whether it's alphabetic, numeric, etc.
+
+    +row
+        +cell #[code N0_ne_iob]
+        +cell #[code token.ent_iob]
+        +cell The Inside/Outside/Begin code of the word's NER tag.
+
+    +row
+        +cell #[code N0_ne_type]
+        +cell #[code token.ent_type]
+        +cell The word's NER type.
diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade
index 6963730ab..da452ac83 100644
--- a/website/docs/usage/training.jade
+++ b/website/docs/usage/training.jade
@@ -74,6 +74,9 @@ p
     | recognizer, with weights learned using the
     | #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].

++aside("Linear Model Feature Scheme")
+    | For a list of the available feature atoms, see the
+    | #[+a("/docs/api/features") Linear Model Feature Scheme].
+
 p
     | Because it's a linear model, it's important for accuracy to build
     | conjunction features out of the atomic predictors. Let's say you have
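To make the stack/buffer scheme described in the patch concrete, here is a minimal sketch of how context-token IDs such as `S0`, `N1` or `E0` could be resolved against a parse state. This is an illustration only: the `State` class, its fields and the helper function are assumptions for the example, not spaCy's internals.

```python
# Minimal sketch: resolving context-token IDs against a parse state.
# The State class and its field names are illustrative assumptions,
# not spaCy internals.
from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class State:
    stack: List[int]                 # word indices of the entity being built
    buffer: List[int]                # word indices still to be processed
    entities: List[Tuple[int, int]]  # (start, end) spans already built

def context_token(state: State, words: List[str], name: str) -> Optional[str]:
    """Map an ID like 'S0', 'N1', 'P2' or 'E0' to a word, or None."""
    kind, i = name[0], int(name[1])
    if kind == "S" and len(state.stack) > i:     # S0 = most recently added
        return words[state.stack[-1 - i]]
    if kind == "N" and len(state.buffer) > i:    # N0 = current word
        return words[state.buffer[i]]
    if kind == "P" and state.buffer:             # P1/P2 = words before N0
        j = state.buffer[0] - i
        return words[j] if j >= 0 else None
    if kind == "E" and len(state.entities) > i:  # E0/E1 = built entities
        start, _ = state.entities[-1 - i]
        return words[start]
    return None

# Example: 'Facebook' is already an entity; 'Oculus VR' is on the stack.
words = "Facebook acquired Oculus VR in 2014".split()
state = State(stack=[2, 3], buffer=[4, 5], entities=[(0, 1)])
print(context_token(state, words, "S0"))  # 'VR' (top of the stack)
print(context_token(state, words, "N0"))  # 'in' (current word)
print(context_token(state, words, "E0"))  # 'Facebook' (previous entity)
```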
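The training docs change above mentions building conjunction features out of the atomic predictors. A minimal sketch of what that could look like follows; the template set and the string encoding are assumptions for illustration, not spaCy's actual feature templates.

```python
# Minimal sketch: combining atomic predictors into conjunction features
# for a sparse linear model. Templates and encoding are illustrative.
def conjunction_features(atoms):
    """atoms maps atom IDs like 'S0p' or 'N0w' to their values for one state."""
    templates = [
        ("S0p",),                # unigram template: tag of S0 alone
        ("S0p", "N0p"),          # tag of S0 joined with tag of N0
        ("S0w", "N0p"),          # word form of S0 joined with tag of N0
        ("N0p", "N1p", "N2p"),   # tag trigram over the buffer
    ]
    features = []
    for template in templates:
        values = [atoms.get(name) for name in template]
        if None in values:       # skip templates with missing tokens
            continue
        # One feature string per (template, values) pair; the linear
        # model learns one weight per distinct string.
        features.append("&".join(template) + "=" + "&".join(values))
    return features

# Example: atoms extracted from one parser state
atoms = {"S0p": "NNP", "S0w": "Google", "N0p": "VBZ", "N1p": "DT", "N2p": "NN"}
print(conjunction_features(atoms))
# ['S0p=NNP', 'S0p&N0p=NNP&VBZ', 'S0w&N0p=Google&VBZ', 'N0p&N1p&N2p=VBZ&DT&NN']
```

Joining the atomic values into a single string is one common way to realise conjunctions in a sparse model: the combined feature fires only when all of its atoms match, which is what lets the model learn interactions the individual atoms cannot express.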