From 22dd929b65d62cdeb1fd65ceb9b304e15b7b90d9 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 3 Oct 2017 14:28:03 +0200 Subject: [PATCH] Add models documentation --- .gitignore | 2 +- website/models/_data.json | 95 +++++++++++++++++++++++++++++++++++++ website/models/de.jade | 6 +++ website/models/en.jade | 6 +++ website/models/es.jade | 6 +++ website/models/fr.jade | 6 +++ website/models/index.jade | 98 +++++++++++++++++++++++++++++++++++++++ website/models/xx.jade | 6 +++ 8 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 website/models/_data.json create mode 100644 website/models/de.jade create mode 100644 website/models/en.jade create mode 100644 website/models/es.jade create mode 100644 website/models/fr.jade create mode 100644 website/models/index.jade create mode 100644 website/models/xx.jade diff --git a/.gitignore b/.gitignore index cb0a8e84e..572eea92d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # spaCy spacy/data/ corpora/ -models/ +/models/ keys/ # Website diff --git a/website/models/_data.json b/website/models/_data.json new file mode 100644 index 000000000..cc26b9bc9 --- /dev/null +++ b/website/models/_data.json @@ -0,0 +1,95 @@ +{ + "sidebar": { + "Models": { + "Overview": "./" + }, + + "Language models": { + "English": "en", + "German": "de", + "Spanish": "es", + "French": "fr", + "Multi-Language": "xx" + } + }, + + "index": { + "title": "Models Overview", + "teaser": "Downloadable statistical models for spaCy to predict and assign linguistic features.", + "quickstart": true, + "menu": { + "Quickstart": "quickstart", + "Installation": "install", + "Naming Conventions": "conventions" + } + }, + + "MODELS": { + "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"], + "de": ["de_dep_news_sm"], + "es": ["es_core_web_sm"], + "fr": [], + "xx": ["xx_ent_wiki_sm"] + }, + + "MODEL_META": { + "core": "Vocabulary, syntax, entities, vectors", + "dep": "Vocabulary, syntax", + "ent": "Named entities", + "vectors": "Word 
vectors", + "web": "written text (blogs, news, comments)", + "news": "written text (news, media)", + "wiki": "Wikipedia", + "uas": "Unlabelled dependencies", + "las": "Labelled dependencies", + "tags_acc": "Part-of-speech tags", + "ents_f": "Entities (F-score)", + "pipeline": "Processing pipeline components in order", + "sources": "Sources of training data" + }, + + "MODEL_LICENSES": { + "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", + "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/", + "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/", + "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/" + }, + + "MODEL_ACCURACY": { + "uas": "UAS", + "las": "LAS", + "tags_acc": "POS", + "ents_f": "NER F" + }, + + "LANGUAGES": { + "en": "English", + "de": "German", + "fr": "French", + "es": "Spanish", + "it": "Italian", + "pt": "Portuguese", + "nl": "Dutch", + "sv": "Swedish", + "fi": "Finnish", + "nb": "Norwegian Bokmål", + "da": "Danish", + "hu": "Hungarian", + "pl": "Polish", + "he": "Hebrew", + "bn": "Bengali", + "id": "Indonesian", + "th": "Thai", + "zh": "Chinese", + "ja": "Japanese", + "xx": "Multi-language" + }, + + "EXAMPLE_SENTENCES": { + "en": "This is a sentence.", + "de": "Dies ist ein Satz.", + "fr": "C'est une phrase.", + "es": "Esto es una frase.", + "xx": "This is a sentence about Facebook." + } +} diff --git a/website/models/de.jade b/website/models/de.jade new file mode 100644 index 000000000..113290b7a --- /dev/null +++ b/website/models/de.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > DE + +include ../_includes/_mixins + +//- This is a placeholder. The page is rendered via the template at +//- /_includes/_page-model.jade. diff --git a/website/models/en.jade b/website/models/en.jade new file mode 100644 index 000000000..4f400662b --- /dev/null +++ b/website/models/en.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > EN + +include ../_includes/_mixins + +//- This is a placeholder. 
The page is rendered via the template at +//- /_includes/_page-model.jade. diff --git a/website/models/es.jade b/website/models/es.jade new file mode 100644 index 000000000..7aad72e81 --- /dev/null +++ b/website/models/es.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > ES + +include ../_includes/_mixins + +//- This is a placeholder. The page is rendered via the template at +//- /_includes/_page-model.jade. diff --git a/website/models/fr.jade b/website/models/fr.jade new file mode 100644 index 000000000..1b3cc3fde --- /dev/null +++ b/website/models/fr.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > FR + +include ../_includes/_mixins + +//- This is a placeholder. The page is rendered via the template at +//- /_includes/_page-model.jade. diff --git a/website/models/index.jade b/website/models/index.jade new file mode 100644 index 000000000..8f9aae739 --- /dev/null +++ b/website/models/index.jade @@ -0,0 +1,98 @@ +//- 💫 DOCS > MODELS + +include ../_includes/_mixins + ++section("quickstart") + p + | spaCy v2.0 features new neural models for #[strong tagging], + | #[strong parsing] and #[strong entity recognition]. The models have + | been designed and implemented from scratch specifically for spaCy, to + | give you an unmatched balance of speed, size and accuracy. A novel + | bloom embedding strategy with subword features is used to support + | huge vocabularies in tiny tables. Convolutional layers with residual + | connections, layer normalization and maxout non-linearity are used, + | giving much better efficiency than the standard BiLSTM solution. For + | more details, see the notes on the + | #[+a("/api/#nn-models") model architecture]. + + p + | The parser and NER use an imitation learning objective to + | deliver #[strong accuracy in line with the latest research systems], + | even when evaluated from raw text.
With these innovations, spaCy + | v2.0's models are #[strong 10× smaller], + | #[strong 20% more accurate], and #[strong just as fast] as the + | previous generation. + + include ../usage/_models/_quickstart + ++section("install") + +h(2, "install") Installation & Usage + + include ../usage/_models/_install-basics + + +infobox + | For more details on how to use models with spaCy, see the + | #[+a("/usage/models") usage guide on models]. + ++section("conventions") + +h(2, "model-naming") Model naming conventions + + p + | In general, spaCy expects all model packages to follow the naming + | convention of #[code [lang]_[name]]. For spaCy's models, we also + | chose to divide the name into three components: + + +table + +row + +cell #[+label Type] + +cell + | Model capabilities (e.g. #[code core] for general-purpose + | model with vocabulary, syntax, entities and word vectors, or + | #[code depent] for only vocab, syntax and entities). + +row + +cell #[+label Genre] + +cell + | Type of text the model is trained on, e.g. #[code web] or + | #[code news]. + +row + +cell #[+label Size] + +cell Model size indicator, #[code sm], #[code md] or #[code lg]. + + p + | For example, #[code en_core_web_sm] is a small English model trained + | on written web text (blogs, news, comments) that includes + | vocabulary, vectors, syntax and entities. + + +h(3, "model-versioning") Model versioning + + p + | Additionally, the model versioning reflects both the compatibility + | with spaCy and the major and minor model version. A model + | version #[code a.b.c] translates to: + + +table + +row + +cell #[code a] + +cell + | #[strong spaCy major version]. For example, #[code 2] for + | spaCy v2.x. + +row + +cell #[code b] + +cell + | #[strong Model major version]. Models with a different major + | version can't be loaded by the same code. For example, + | changing the width of the model, adding hidden layers or + | changing the activation changes the model major version.
+ +row + +cell #[code c] + +cell + | #[strong Model minor version]. Same model structure, but + | different parameter values, e.g. from being trained on + | different data, for different numbers of iterations, etc. + + p + | For a detailed compatibility overview, see the + | #[+a(gh("spacy-models", "compatibility.json")) #[code compatibility.json]] + | in the models repository. This is also the source of spaCy's internal + | compatibility check, performed when you run the + | #[+api("cli#download") #[code download]] command. diff --git a/website/models/xx.jade b/website/models/xx.jade new file mode 100644 index 000000000..8967f38fa --- /dev/null +++ b/website/models/xx.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > XX + +include ../_includes/_mixins + +//- This is a placeholder. The page is rendered via the template at +//- /_includes/_page-model.jade.