Add models documentation

2017-10-03 14:28:03 +02:00 · 2017-10-03 14:28:03 +02:00 · 22dd929b65
parent 808f7ee417
commit 22dd929b65
8 changed files with 224 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,7 @@
 # spaCy
 spacy/data/
 corpora/
-models/
+/models/
 keys/

 # Website
--- a/website/models/_data.json
+++ b/website/models/_data.json
@ -0,0 +1,95 @@
+{
+    "sidebar": {
+        "Models": {
+            "Overview": "./"
+        },
+
+        "Language models": {
+            "English": "en",
+            "German": "de",
+            "Spanish": "es",
+            "French": "fr",
+            "Multi-Language": "xx"
+        }
+    },
+
+    "index": {
+        "title": "Models Overview",
+        "teaser": "Downloadable statistical models for spaCy to predict and assign linguistic features.",
+        "quickstart": true,
+        "menu": {
+            "Quickstart": "quickstart",
+            "Installation": "install",
+            "Naming Conventions": "conventions"
+        }
+    },
+
+    "MODELS": {
+        "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
+        "de": ["de_dep_news_sm"],
+        "es": ["es_core_web_sm"],
+        "fr": [],
+        "xx": ["xx_ent_wiki_sm"]
+    },
+
+    "MODEL_META": {
+        "core": "Vocabulary, syntax, entities, vectors",
+        "dep": "Vocabulary, syntax",
+        "ent": "Named entities",
+        "vectors": "Word vectors",
+        "web": "written text (blogs, news, comments)",
+        "news": "written text (news, media)",
+        "wiki": "Wikipedia",
+        "uas": "Unlabelled dependencies",
+        "las": "Labelled dependencies",
+        "tags_acc": "Part-of-speech tags",
+        "ents_f": "Entities (F-score)",
+        "pipeline": "Processing pipeline components in order",
+        "sources": "Sources of training data"
+    },
+
+    "MODEL_LICENSES": {
+        "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/",
+        "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
+        "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/",
+        "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/"
+    },
+
+    "MODEL_ACCURACY": {
+        "uas": "UAS",
+        "las": "LAS",
+        "tags_acc": "POS",
+        "ents_f": "NER F"
+    },
+
+    "LANGUAGES": {
+        "en": "English",
+        "de": "German",
+        "fr": "French",
+        "es": "Spanish",
+        "it": "Italian",
+        "pt": "Portuguese",
+        "nl": "Dutch",
+        "sv": "Swedish",
+        "fi": "Finnish",
+        "nb": "Norwegian Bokmål",
+        "da": "Danish",
+        "hu": "Hungarian",
+        "pl": "Polish",
+        "he": "Hebrew",
+        "bn": "Bengali",
+        "id": "Indonesian",
+        "th": "Thai",
+        "zh": "Chinese",
+        "ja": "Japanese",
+        "xx": "Multi-language"
+    },
+
+    "EXAMPLE_SENTENCES": {
+        "en": "This is a sentence.",
+        "de": "Dies ist ein Satz.",
+        "fr": "C'est une phrase.",
+        "es": "Esto es una frase.",
+        "xx": "This is a sentence about Facebook."
+    }
+}
--- a/website/models/de.jade
+++ b/website/models/de.jade
@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > DE
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
--- a/website/models/en.jade
+++ b/website/models/en.jade
@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > EN
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
--- a/website/models/es.jade
+++ b/website/models/es.jade
@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > ES
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
--- a/website/models/fr.jade
+++ b/website/models/fr.jade
@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > FR
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.
--- a/website/models/index.jade
+++ b/website/models/index.jade
@ -0,0 +1,98 @@
+//- 💫 DOCS > MODELS
+
+include ../_includes/_mixins
+
+section("quickstart")
+    p
+        |  spaCy v2.0 features new neural models for #[strong tagging],
+        |  #[strong parsing] and #[strong entity recognition]. The models have
+        |  been designed and implemented from scratch specifically for spaCy, to
+        |  give you an unmatched balance of speed, size and accuracy. A novel
+        |  bloom embedding strategy with subword features is used to support
+        |  huge vocabularies in tiny tables. Convolutional layers with residual
+        |  connections, layer normalization and maxout non-linearity are used,
+        |  giving much better efficiency than the standard BiLSTM solution. For
+        |  more details, see the notes on the
+        |  #[+a("/api/#nn-models") model architecture].
+
+    p
+        |  The parser and NER use an imitation learning objective to
+        |  deliver #[strong accuracy in line with the latest research systems],
+        |  even when evaluated from raw text. With these innovations, spaCy
+        |  v2.0's models are #[strong 10&times; smaller],
+        |  #[strong 20% more accurate], and #[strong just as fast] as the
+        |  previous generation.
+
+    include ../usage/_models/_quickstart
+
+section("install")
+    +h(2, "install") Installation & Usage
+
+    include ../usage/_models/_install-basics
+
+    +infobox
+        |  For more details on how to use models with spaCy, see the
+        |  #[+a("/usage/models") usage guide on models].
+
+section("conventions")
+    +h(2, "model-naming") Model naming conventions
+
+    p
+        |  In general, spaCy expects all model packages to follow the naming
+        |  convention of #[code [lang]_[name]]. For spaCy's models, we also
+        |  chose to divide the name into three components:
+
+    +table
+        +row
+            +cell #[+label Type]
+            +cell
+                |  Model capabilities (e.g. #[code core] for a general-purpose
+                |  model with vocabulary, syntax, entities and word vectors, or
+                |  #[code depent] for only vocab, syntax and entities).
+        +row
+            +cell #[+label Genre]
+            +cell
+                |  Type of text the model is trained on, e.g. #[code web] or
+                |  #[code news].
+        +row
+            +cell #[+label Size]
+            +cell Model size indicator, #[code sm], #[code md] or #[code lg].
+
+    p
+        |  For example, #[code en_core_web_sm] is a small English model trained
+        |  on written web text (blogs, news, comments) that includes
+        |  vocabulary, vectors, syntax and entities.
+
+    +h(3, "model-versioning") Model versioning
+
+    p
+        |  Additionally, the model versioning reflects both the compatibility
+        |  with spaCy and the major and minor model version. A model
+        |  version #[code a.b.c] translates to:
+
+    +table
+        +row
+            +cell #[code a]
+            +cell
+                |  #[strong spaCy major version]. For example, #[code 2] for
+                |  spaCy v2.x.
+        +row
+            +cell #[code b]
+            +cell
+                |  #[strong Model major version]. Models with a different major
+                |  version can't be loaded by the same code. For example,
+                |  changing the width of the model, adding hidden layers or
+                |  changing the activation changes the model major version.
+        +row
+            +cell #[code c]
+            +cell
+                |  #[strong Model minor version]. Same model structure, but
+                |  different parameter values, e.g. from being trained on
+                |  different data, for different numbers of iterations, etc.
+
+    p
+        |  For a detailed compatibility overview, see the
+        |  #[+a(gh("spacy-models", "compatibility.json")) #[code compatibility.json]]
+        |  in the models repository. This is also the source of spaCy's internal
+        |  compatibility check, performed when you run the
+        |  #[+api("cli#download") #[code download]] command.
--- a/website/models/xx.jade
+++ b/website/models/xx.jade
@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > XX
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page-model.jade.