diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade index 1cab930fb..c7742fa38 100644 --- a/website/_includes/_page_models.jade +++ b/website/_includes/_page_models.jade @@ -40,6 +40,8 @@ for id in CURRENT_MODELS each label in ["Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() + if field == "vectors" + - field = "vecs" +row +cell.u-nowrap +label=label diff --git a/website/assets/js/models.js b/website/assets/js/models.js index 2d371ee1f..f5757c8cb 100644 --- a/website/assets/js/models.js +++ b/website/assets/js/models.js @@ -20,21 +20,33 @@ const CHART_FONTS = { * @property {function} vectors - Format vector data (entries and dimensions). * @property {function} version - Format model version number. */ -export const formats = { +const formats = { author: (author, url) => url ? `${author}` : author, license: (license, url) => url ? `${license}` : license, sources: sources => (sources instanceof Array) ? sources.join(', ') : sources, pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `${p}`).join(', ') : '-', - vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a', + vectors: vec => formatVectors(vec), version: version => `v${version}` }; +/** + * Format word vectors data depending on contents. + * @property {Object} data - The vectors object from the model's meta.json. + */ +const formatVectors = data => { + if (!data) return 'n/a'; + if (Object.values(data).every(n => n == 0)) return 'context vectors only'; + const { keys, vectors: vecs, width } = data; + return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`; +} + + /** * Find the latest version of a model in a compatibility table. * @param {string} model - The model name. * @param {Object} compat - Compatibility table, keyed by spaCy version. */ -export const getLatestVersion = (model, compat = {}) => { +const getLatestVersion = (model, compat = {}) => { for (let [spacy_v, models] of Object.entries(compat)) { if (models[model]) return models[model][0]; } @@ -90,7 +102,7 @@ export class ModelLoader { const tpl = new Templater(modelId); tpl.get('table').removeAttribute('data-loading'); tpl.get('error').style.display = 'block'; - for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) { + for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) { tpl.get(key).parentElement.parentElement.style.display = 'none'; } } @@ -120,8 +132,8 @@ export class ModelLoader { if (author) tpl.fill('author', formats.author(author, url), true); if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true); if (sources) tpl.fill('sources', formats.sources(sources)); - if (vectors) tpl.fill('vectors', formats.vectors(vectors)); - else tpl.get('vectors').parentElement.parentElement.style.display = 'none'; + if (vectors) tpl.fill('vecs', formats.vectors(vectors)); + else tpl.get('vecs').parentElement.parentElement.style.display = 'none'; if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true); else tpl.get('pipeline').parentElement.parentElement.style.display = 'none'; } @@ -223,8 +235,9 @@ export class ModelComparer { const version = getLatestVersion(name, this.compat); const modelName = `${name}-${version}`; return new Promise((resolve, reject) => { + if (!version) reject(); // resolve immediately if model already loaded, e.g. in this.models - if (this.models[name]) resolve(this.models[name]); + else if (this.models[name]) resolve(this.models[name]); else fetch(`${this.url}/meta/${modelName}.json`) .then(res => handleResponse(res)) .then(json => json.ok ? resolve(this.saveModel(name, json)) : reject()) @@ -306,12 +319,13 @@ export class ModelComparer { this.tpl.fill(`size${i}`, size); this.tpl.fill(`desc${i}`, description || 'n/a'); this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true); - this.tpl.fill(`vectors${i}`, formats.vectors(vectors)); + this.tpl.fill(`vecs${i}`, formats.vectors(vectors)); this.tpl.fill(`sources${i}`, formats.sources(sources)); this.tpl.fill(`author${i}`, formats.author(author, url), true); this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true); // check if model accuracy or speed includes one of the pre-set keys - for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) { + const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v))); + for (let key of allKeys) { if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2)) else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key]))) else this.tpl.fill(`${key}${i}`, 'n/a') diff --git a/website/models/_data.json b/website/models/_data.json index d64c94074..8507a3fa1 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -68,6 +68,7 @@ "gpu": "words per second on GPU", "pipeline": "Processing pipeline components in order", "sources": "Sources of training data", + "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.", "benchmark_parser": "Parser accuracy", "benchmark_ner": "NER accuracy", "benchmark_speed": "Speed" diff --git a/website/models/comparison.jade b/website/models/comparison.jade index 881a9aff4..b0ab61efe 100644 --- a/website/models/comparison.jade +++ b/website/models/comparison.jade @@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none") for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() + if field == "vectors" + - field = "vecs" +row +cell.u-nowrap +label=label diff --git a/website/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade index bb9add8a6..c38360014 100644 --- a/website/usage/_spacy-101/_word-vectors.jade +++ b/website/usage/_spacy-101/_word-vectors.jade @@ -4,9 +4,9 @@ p | Similarity is determined by comparing #[strong word vectors] or "word | embeddings", multi-dimensional meaning representations of a word. Word | vectors can be generated using an algorithm like - | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's - | #[+a("/models") default models] come with - | #[strong 300-dimensional vectors] that look like this: + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium + | #[code md] and large #[code lg] #[+a("/models") models] come with + | #[strong multi-dimensional vectors] that look like this: +code("banana.vector", false, false, 250). array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade index 07ad6bcd4..734495c6e 100644 --- a/website/usage/_vectors-similarity/_basics.jade +++ b/website/usage/_vectors-similarity/_basics.jade @@ -4,12 +4,9 @@ | Dense, real valued vectors representing distributional similarity | information are now a cornerstone of practical NLP. The most common way | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] - | family of algorithms. The default - | #[+a("/models/en") English model] installs - | 300-dimensional vectors trained on the - | #[+a("http://commoncrawl.org") Common Crawl] corpus. - | If you need to train a word2vec model, we recommend the implementation in - | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. + | family of algorithms. If you need to train a word2vec model, we recommend + | the implementation in the Python library + | #[+a("https://radimrehurek.com/gensim/") Gensim]. include ../_spacy-101/_similarity include ../_spacy-101/_word-vectors