mirror of https://github.com/explosion/spaCy.git
Update init config and recommendations
- As much as I dislike YAML, it seemed like a better format here because it allows us to add comments if we want to explain the different recommendations - Don't include the generated JS in the repo by default and build it on the fly when running or deploying the site. This ensures it's always up to date. - Simplify jinja_to_js script and use fewer dependencies
This commit is contained in:
parent
225f8866a1
commit
e2f2ef3a5a
|
@ -20,6 +20,7 @@ website/logs
|
||||||
npm-debug.log*
|
npm-debug.log*
|
||||||
website/www/
|
website/www/
|
||||||
website/_deploy.sh
|
website/_deploy.sh
|
||||||
|
quickstart-training-generator.js
|
||||||
|
|
||||||
# Cython / C extensions
|
# Cython / C extensions
|
||||||
cythonize.json
|
cythonize.json
|
||||||
|
|
|
@ -3,17 +3,17 @@ from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, diff_strings
|
from wasabi import Printer, diff_strings
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
from pydantic import BaseModel
|
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..schemas import RecommendationSchema
|
||||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||||
|
|
||||||
|
|
||||||
TEMPLATE_ROOT = Path(__file__).parent / "templates"
|
ROOT = Path(__file__).parent / "templates"
|
||||||
TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja"
|
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
||||||
RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json"
|
RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml")
|
||||||
|
|
||||||
|
|
||||||
class Optimizations(str, Enum):
|
class Optimizations(str, Enum):
|
||||||
|
@ -21,21 +21,6 @@ class Optimizations(str, Enum):
|
||||||
accuracy = "accuracy"
|
accuracy = "accuracy"
|
||||||
|
|
||||||
|
|
||||||
class RecommendationsTrfItem(BaseModel):
|
|
||||||
name: str
|
|
||||||
size_factor: int
|
|
||||||
|
|
||||||
|
|
||||||
class RecommendationsTrf(BaseModel):
|
|
||||||
efficiency: RecommendationsTrfItem
|
|
||||||
accuracy: RecommendationsTrfItem
|
|
||||||
|
|
||||||
|
|
||||||
class RecommendationSchema(BaseModel):
|
|
||||||
word_vectors: Optional[str] = None
|
|
||||||
transformer: Optional[RecommendationsTrf] = None
|
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("config")
|
@init_cli.command("config")
|
||||||
def init_config_cli(
|
def init_config_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
|
@ -111,14 +96,11 @@ def init_config(
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
except ImportError:
|
except ImportError:
|
||||||
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
msg.fail("This command requires jinja2", "pip install jinja2", exits=1)
|
||||||
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
|
|
||||||
lang_defaults = util.get_lang_class(lang).Defaults
|
|
||||||
has_letters = lang_defaults.writing_system.get("has_letters", True)
|
|
||||||
# Filter out duplicates since tok2vec and transformer are added by template
|
|
||||||
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
|
||||||
reco = RecommendationSchema(**recommendations.get(lang, {})).dict()
|
|
||||||
with TEMPLATE_PATH.open("r") as f:
|
with TEMPLATE_PATH.open("r") as f:
|
||||||
template = Template(f.read())
|
template = Template(f.read())
|
||||||
|
# Filter out duplicates since tok2vec and transformer are added by template
|
||||||
|
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
||||||
|
reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict()
|
||||||
variables = {
|
variables = {
|
||||||
"lang": lang,
|
"lang": lang,
|
||||||
"components": pipeline,
|
"components": pipeline,
|
||||||
|
@ -126,7 +108,7 @@ def init_config(
|
||||||
"hardware": "cpu" if cpu else "gpu",
|
"hardware": "cpu" if cpu else "gpu",
|
||||||
"transformer_data": reco["transformer"],
|
"transformer_data": reco["transformer"],
|
||||||
"word_vectors": reco["word_vectors"],
|
"word_vectors": reco["word_vectors"],
|
||||||
"has_letters": has_letters,
|
"has_letters": reco["has_letters"],
|
||||||
}
|
}
|
||||||
base_template = template.render(variables).strip()
|
base_template = template.render(variables).strip()
|
||||||
# Giving up on getting the newlines right in jinja for now
|
# Giving up on getting the newlines right in jinja for now
|
||||||
|
|
|
@ -1,13 +0,0 @@
|
||||||
{
|
|
||||||
"en": {
|
|
||||||
"word_vectors": "en_vectors_web_lg",
|
|
||||||
"transformer": {
|
|
||||||
"efficiency": { "name": "roberta-base", "size_factor": 3 },
|
|
||||||
"accuracy": { "name": "roberta-base", "size_factor": 3 }
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"de": {
|
|
||||||
"word_vectors": null,
|
|
||||||
"transformer": null
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,103 @@
|
||||||
|
# Recommended settings and available resources for each language, if available.
|
||||||
|
# Not all languages have recommended word vectors or transformers and for some,
|
||||||
|
# the recommended transformer for efficiency and accuracy may be the same.
|
||||||
|
en:
|
||||||
|
word_vectors: en_vectors_web_lg
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: roberta-base
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: roberta-base
|
||||||
|
size_factor: 3
|
||||||
|
de:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: bert-base-german-cased
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: bert-base-german-cased
|
||||||
|
size_factor: 3
|
||||||
|
fr:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: camembert-base
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: camembert-base
|
||||||
|
size_factor: 3
|
||||||
|
es:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: mrm8488/RuPERTa-base
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: mrm8488/RuPERTa-base
|
||||||
|
size_factor: 3
|
||||||
|
sv:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: KB/bert-base-swedish-cased
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: KB/bert-base-swedish-cased
|
||||||
|
size_factor: 3
|
||||||
|
fi:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: TurkuNLP/bert-base-finnish-cased-v1
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: TurkuNLP/bert-base-finnish-cased-v1
|
||||||
|
size_factor: 3
|
||||||
|
el:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: nlpaueb/bert-base-greek-uncased-v1
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: nlpaueb/bert-base-greek-uncased-v1
|
||||||
|
size_factor: 3
|
||||||
|
tr:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: dbmdz/bert-base-turkish-cased
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: dbmdz/bert-base-turkish-cased
|
||||||
|
size_factor: 3
|
||||||
|
zh:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: bert-base-chinese
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: bert-base-chinese
|
||||||
|
size_factor: 3
|
||||||
|
has_letters: false
|
||||||
|
ar:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: asafaya/bert-base-arabic
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: asafaya/bert-base-arabic
|
||||||
|
size_factor: 3
|
||||||
|
pl:
|
||||||
|
word_vectors: null
|
||||||
|
transformer:
|
||||||
|
efficiency:
|
||||||
|
name: dkleczek/bert-base-polish-cased-v1
|
||||||
|
size_factor: 3
|
||||||
|
accuracy:
|
||||||
|
name: dkleczek/bert-base-polish-cased-v1
|
||||||
|
size_factor: 3
|
|
@ -311,3 +311,22 @@ class ProjectConfigSchema(BaseModel):
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
title = "Schema for project configuration file"
|
title = "Schema for project configuration file"
|
||||||
|
|
||||||
|
|
||||||
|
# Recommendations for init config workflows
|
||||||
|
|
||||||
|
|
||||||
|
class RecommendationTrfItem(BaseModel):
|
||||||
|
name: str
|
||||||
|
size_factor: int
|
||||||
|
|
||||||
|
|
||||||
|
class RecommendationTrf(BaseModel):
|
||||||
|
efficiency: RecommendationTrfItem
|
||||||
|
accuracy: RecommendationTrfItem
|
||||||
|
|
||||||
|
|
||||||
|
class RecommendationSchema(BaseModel):
|
||||||
|
word_vectors: Optional[str] = None
|
||||||
|
transformer: Optional[RecommendationTrf] = None
|
||||||
|
has_letters: bool = True
|
||||||
|
|
|
@ -2,10 +2,9 @@ import pytest
|
||||||
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
||||||
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.schemas import ProjectConfigSchema, validate
|
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||||
from spacy.cli.pretrain import make_docs
|
from spacy.cli.pretrain import make_docs
|
||||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH
|
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||||
from spacy.cli.init_config import RecommendationSchema
|
|
||||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
from spacy.util import get_lang_class
|
from spacy.util import get_lang_class
|
||||||
import srsly
|
import srsly
|
||||||
|
@ -335,7 +334,5 @@ def test_init_config(lang, pipeline, optimize):
|
||||||
|
|
||||||
|
|
||||||
def test_model_recommendations():
|
def test_model_recommendations():
|
||||||
recommendations = srsly.read_json(RECOMMENDATIONS_PATH)
|
for lang, data in RECOMMENDATIONS.items():
|
||||||
for lang, data in recommendations.items():
|
|
||||||
assert get_lang_class(lang)
|
|
||||||
assert RecommendationSchema(**data)
|
assert RecommendationSchema(**data)
|
||||||
|
|
|
@ -53,7 +53,7 @@
|
||||||
"remark-react": "^5.0.1"
|
"remark-react": "^5.0.1"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "npm run python:setup && gatsby build",
|
"build": "npm run python:install && npm run python:setup && gatsby build",
|
||||||
"dev": "npm run python:setup && gatsby develop",
|
"dev": "npm run python:setup && gatsby develop",
|
||||||
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
|
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
|
||||||
"lint": "eslint **",
|
"lint": "eslint **",
|
||||||
|
|
|
@ -11,7 +11,8 @@ from os import path
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from jinja2 import Environment, FileSystemLoader, nodes
|
from jinja2 import Environment, FileSystemLoader, nodes
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import typer
|
import srsly
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
OPERANDS = {
|
OPERANDS = {
|
||||||
|
@ -437,7 +438,8 @@ class JinjaToJS(object):
|
||||||
with self._interpolation():
|
with self._interpolation():
|
||||||
with self._python_bool_wrapper(**kwargs):
|
with self._python_bool_wrapper(**kwargs):
|
||||||
if node.items:
|
if node.items:
|
||||||
raise ValueError(f"Can't process non-empty dict in epxression: {node}")
|
err = f"Can't process non-empty dict in expression: {node}"
|
||||||
|
raise ValueError(err)
|
||||||
self.output.write("{}")
|
self.output.write("{}")
|
||||||
|
|
||||||
def _process_getattr(self, node, **kwargs):
|
def _process_getattr(self, node, **kwargs):
|
||||||
|
@ -1232,18 +1234,22 @@ class JinjaToJS(object):
|
||||||
self.output.write(")")
|
self.output.write(")")
|
||||||
|
|
||||||
|
|
||||||
def main(
|
def main(template_path, output=None, data_path=None):
|
||||||
# fmt: off
|
"""Convert a jinja2 template to a JavaScript module.
|
||||||
template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"),
|
|
||||||
output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"),
|
template_path (Path): Path to .jinja file.
|
||||||
data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA")
|
output (Optional[Path]): Path to output .js module (stdout if unset).
|
||||||
# fmt: on
|
data_path (Optional[Path]): Optional JSON or YAML file with additional data
|
||||||
):
|
to be included in the JS module as the exported variable DATA.
|
||||||
"""Convert a jinja2 template to a JavaScript module."""
|
"""
|
||||||
data = "{}"
|
data = "{}"
|
||||||
if data_path is not None:
|
if data_path is not None:
|
||||||
with data_path.open("r", encoding="utf8") as f:
|
if data_path.suffix in (".yml", ".yaml"):
|
||||||
data = json.dumps(json.loads(f.read())) # dump and load for compactness
|
data = srsly.read_yaml(data_path)
|
||||||
|
else:
|
||||||
|
data = srsly.read_json(data_path)
|
||||||
|
data = srsly.json_dumps(data) # dump and load for compactness
|
||||||
|
template_path = Path(template_path)
|
||||||
tpl_file = template_path.parts[-1]
|
tpl_file = template_path.parts[-1]
|
||||||
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
|
compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6")
|
||||||
header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
|
header = f"// This file was auto-generated by {__file__} based on {tpl_file}"
|
||||||
|
@ -1258,4 +1264,10 @@ def main(
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
typer.run(main)
|
args = sys.argv[1:]
|
||||||
|
if not len(args):
|
||||||
|
raise ValueError("Need at least one argument: path to .jinja template")
|
||||||
|
template_path = Path(args[0])
|
||||||
|
output = Path(args[1]) if len(args) > 1 else None
|
||||||
|
data_path = Path(args[2]) if len(args) > 2 else None
|
||||||
|
main(template_path, output, data_path)
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
# These are used to compile the training quickstart config
|
# These are used to compile the training quickstart config
|
||||||
jinja2
|
jinja2
|
||||||
typer
|
srsly
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json
|
python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js ../../spacy/cli/templates/quickstart_training_recommendations.yml
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -4,7 +4,7 @@ import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
|
||||||
|
|
||||||
import { Quickstart } from '../components/quickstart'
|
import { Quickstart } from '../components/quickstart'
|
||||||
import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
|
import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator'
|
||||||
import { isString, htmlToReact } from '../components/util'
|
import { htmlToReact } from '../components/util'
|
||||||
|
|
||||||
const DEFAULT_LANG = 'en'
|
const DEFAULT_LANG = 'en'
|
||||||
const DEFAULT_HARDWARE = 'gpu'
|
const DEFAULT_HARDWARE = 'gpu'
|
||||||
|
@ -47,13 +47,6 @@ const DATA = [
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
function stringify(value) {
|
|
||||||
if (isString(value) && value.startsWith('${')) return value
|
|
||||||
const string = JSON.stringify(value)
|
|
||||||
if (Array.isArray(value)) return string.replace(/,/g, ', ')
|
|
||||||
return string
|
|
||||||
}
|
|
||||||
|
|
||||||
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
|
export default function QuickstartTraining({ id, title, download = 'config.cfg' }) {
|
||||||
const [lang, setLang] = useState(DEFAULT_LANG)
|
const [lang, setLang] = useState(DEFAULT_LANG)
|
||||||
const [components, setComponents] = useState([])
|
const [components, setComponents] = useState([])
|
||||||
|
@ -73,6 +66,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
||||||
hardware,
|
hardware,
|
||||||
transformer_data: reco.transformer,
|
transformer_data: reco.transformer,
|
||||||
word_vectors: reco.word_vectors,
|
word_vectors: reco.word_vectors,
|
||||||
|
has_letters: reco.has_letters,
|
||||||
})
|
})
|
||||||
const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
|
const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n')
|
||||||
const rawContent = `${COMMENT}\n${rawStr}`
|
const rawContent = `${COMMENT}\n${rawStr}`
|
||||||
|
@ -90,7 +84,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg'
|
||||||
id: code,
|
id: code,
|
||||||
title: name,
|
title: name,
|
||||||
}))
|
}))
|
||||||
.sort((a, b) => a.id.localeCompare(b.id))
|
.sort((a, b) => a.title.localeCompare(b.title))
|
||||||
return (
|
return (
|
||||||
<Quickstart
|
<Quickstart
|
||||||
download={download}
|
download={download}
|
||||||
|
|
Loading…
Reference in New Issue