From 2fbd43c6034059a79ac858308d153645945dcb6f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 16 Oct 2020 08:17:53 +0200 Subject: [PATCH 1/2] Use core lg models as vectors models in quickstart --- .../quickstart_training_recommendations.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index 54aec2e31..09cc3a624 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -2,7 +2,7 @@ # Not all languages have recommended word vectors or transformers and for some, # the recommended transformer for efficiency and accuracy may be the same. en: - word_vectors: en_vectors_web_lg + word_vectors: en_core_web_lg transformer: efficiency: name: roberta-base @@ -11,7 +11,7 @@ en: name: roberta-base size_factor: 3 de: - word_vectors: null + word_vectors: de_core_news_lg transformer: efficiency: name: bert-base-german-cased @@ -20,7 +20,7 @@ de: name: bert-base-german-cased size_factor: 3 fr: - word_vectors: null + word_vectors: fr_core_news_lg transformer: efficiency: name: camembert-base @@ -29,7 +29,7 @@ fr: name: camembert-base size_factor: 3 es: - word_vectors: null + word_vectors: es_core_news_lg transformer: efficiency: name: dccuchile/bert-base-spanish-wwm-cased @@ -56,7 +56,7 @@ fi: name: TurkuNLP/bert-base-finnish-cased-v1 size_factor: 3 el: - word_vectors: null + word_vectors: el_core_news_lg transformer: efficiency: name: nlpaueb/bert-base-greek-uncased-v1 @@ -74,7 +74,7 @@ tr: name: dbmdz/bert-base-turkish-cased size_factor: 3 zh: - word_vectors: null + word_vectors: zh_core_web_lg transformer: efficiency: name: bert-base-chinese @@ -93,7 +93,7 @@ ar: name: asafaya/bert-base-arabic size_factor: 3 pl: - word_vectors: null + word_vectors: pl_core_news_lg transformer: efficiency: name: dkleczek/bert-base-polish-cased-v1 @@ 
-102,7 +102,7 @@ pl: name: dkleczek/bert-base-polish-cased-v1 size_factor: 3 nl: - word_vectors: null + word_vectors: nl_core_news_lg transformer: efficiency: name: pdelobelle/robbert-v2-dutch-base @@ -111,7 +111,7 @@ nl: name: pdelobelle/robbert-v2-dutch-base size_factor: 3 pt: - word_vectors: null + word_vectors: pt_core_news_lg transformer: efficiency: name: neuralmind/bert-base-portuguese-cased From c8d04b79e279353341d966fd0b2d096ff4fe5dfe Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 16 Oct 2020 08:25:16 +0200 Subject: [PATCH 2/2] Sort and add vectors for langs without transformers --- .../quickstart_training_recommendations.yml | 132 ++++++++++-------- 1 file changed, 75 insertions(+), 57 deletions(-) diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index 09cc3a624..47b3abbf6 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -1,15 +1,18 @@ # Recommended settings and available resources for each language, if available. # Not all languages have recommended word vectors or transformers and for some, # the recommended transformer for efficiency and accuracy may be the same. 
-en: - word_vectors: en_core_web_lg +ar: + word_vectors: null transformer: efficiency: - name: roberta-base + name: asafaya/bert-base-arabic size_factor: 3 accuracy: - name: roberta-base + name: asafaya/bert-base-arabic size_factor: 3 +da: + word_vectors: da_core_news_lg + transformer: null de: word_vectors: de_core_news_lg transformer: @@ -19,14 +22,23 @@ de: accuracy: name: bert-base-german-cased size_factor: 3 -fr: - word_vectors: fr_core_news_lg +el: + word_vectors: el_core_news_lg transformer: efficiency: - name: camembert-base + name: nlpaueb/bert-base-greek-uncased-v1 size_factor: 3 accuracy: - name: camembert-base + name: nlpaueb/bert-base-greek-uncased-v1 + size_factor: 3 +en: + word_vectors: en_core_web_lg + transformer: + efficiency: + name: roberta-base + size_factor: 3 + accuracy: + name: roberta-base size_factor: 3 es: word_vectors: es_core_news_lg @@ -37,15 +49,6 @@ es: accuracy: name: dccuchile/bert-base-spanish-wwm-cased size_factor: 3 -sv: - word_vectors: null - transformer: - efficiency: - name: KB/bert-base-swedish-cased - size_factor: 3 - accuracy: - name: KB/bert-base-swedish-cased - size_factor: 3 fi: word_vectors: null transformer: @@ -55,14 +58,65 @@ fi: accuracy: name: TurkuNLP/bert-base-finnish-cased-v1 size_factor: 3 -el: - word_vectors: el_core_news_lg +fr: + word_vectors: fr_core_news_lg transformer: efficiency: - name: nlpaueb/bert-base-greek-uncased-v1 + name: camembert-base size_factor: 3 accuracy: - name: nlpaueb/bert-base-greek-uncased-v1 + name: camembert-base + size_factor: 3 +it: + word_vectors: it_core_news_lg + transformer: null +ja: + word_vectors: ja_core_news_lg + transformer: null +lt: + word_vectors: lt_core_news_lg + transformer: null +nb: + word_vectors: nb_core_news_lg + transformer: null +nl: + word_vectors: nl_core_news_lg + transformer: + efficiency: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 + accuracy: + name: pdelobelle/robbert-v2-dutch-base + size_factor: 3 +pl: + word_vectors: 
pl_core_news_lg + transformer: + efficiency: + name: dkleczek/bert-base-polish-cased-v1 + size_factor: 3 + accuracy: + name: dkleczek/bert-base-polish-cased-v1 + size_factor: 3 +pt: + word_vectors: pt_core_news_lg + transformer: + efficiency: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3 + accuracy: + name: neuralmind/bert-base-portuguese-cased + size_factor: 3 +ro: + word_vectors: ro_core_news_lg + transformer: null +sv: + word_vectors: null + transformer: + efficiency: + name: KB/bert-base-swedish-cased + size_factor: 3 + accuracy: + name: KB/bert-base-swedish-cased size_factor: 3 tr: word_vectors: null @@ -83,39 +137,3 @@ zh: name: bert-base-chinese size_factor: 3 has_letters: false -ar: - word_vectors: null - transformer: - efficiency: - name: asafaya/bert-base-arabic - size_factor: 3 - accuracy: - name: asafaya/bert-base-arabic - size_factor: 3 -pl: - word_vectors: pl_core_news_lg - transformer: - efficiency: - name: dkleczek/bert-base-polish-cased-v1 - size_factor: 3 - accuracy: - name: dkleczek/bert-base-polish-cased-v1 - size_factor: 3 -nl: - word_vectors: nl_core_news_lg - transformer: - efficiency: - name: pdelobelle/robbert-v2-dutch-base - size_factor: 3 - accuracy: - name: pdelobelle/robbert-v2-dutch-base - size_factor: 3 -pt: - word_vectors: pt_core_news_lg - transformer: - efficiency: - name: neuralmind/bert-base-portuguese-cased - size_factor: 3 - accuracy: - name: neuralmind/bert-base-portuguese-cased - size_factor: 3