diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 86c4a990d..1d0b9d401 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,7 +1,7 @@ from typing import Optional, Union, Any, Dict, List, Tuple import shutil from pathlib import Path -from wasabi import Printer, get_raw_input +from wasabi import Printer, MarkdownRenderer, get_raw_input import srsly import sys @@ -134,6 +134,11 @@ def package( file_path = package_path / model_name_v / file_name if file_path.exists(): shutil.copy(str(file_path), str(main_path)) + readme_path = main_path / "README.md" + if not readme_path.exists(): + readme = generate_readme(meta) + create_file(readme_path, readme) + create_file(package_path / model_name_v / "README.md", readme) imports = [] for code_path in code_paths: imports.append(code_path.stem) @@ -234,6 +239,113 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any] return meta +def generate_readme(meta: Dict[str, Any]) -> str: + """ + Generate a Markdown-formatted README text from a model meta.json. Used + within the GitHub release notes and as content for README.md file added + to model packages. + """ + md = MarkdownRenderer() + lang = meta["lang"] + name = f"{lang}_{meta['name']}" + version = meta["version"] + pipeline = ", ".join([md.code(p) for p in meta.get("pipeline", [])]) + components = ", ".join([md.code(p) for p in meta.get("components", [])]) + vecs = meta.get("vectors", {}) + vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({ vecs.get('width', 0)} dimensions)" + author = meta.get("author") or "n/a" + notes = meta.get("notes", "") + license_name = meta.get("license") + sources = _format_sources(meta.get("sources")) + description = meta.get("description") + label_scheme = _format_label_scheme(meta.get("labels")) + accuracy = _format_accuracy(meta.get("performance")) + table_data = [ + (md.bold("Name"), md.code(name)), + (md.bold("Version"), md.code(version)), + (md.bold("spaCy"), md.code(meta["spacy_version"])), + (md.bold("Default Pipeline"), pipeline), + (md.bold("Components"), components), + (md.bold("Vectors"), vectors), + (md.bold("Sources"), sources or "n/a"), + (md.bold("License"), md.code(license_name) if license_name else "n/a"), + (md.bold("Author"), md.link(author, meta["url"]) if "url" in meta else author), + ] + # Put together Markdown body + if description: + md.add(description) + md.add(md.table(table_data, ["Feature", "Description"])) + if label_scheme: + md.add(md.title(3, "Label Scheme")) + md.add(label_scheme) + if accuracy: + md.add(md.title(3, "Accuracy")) + md.add(accuracy) + if notes: + md.add(notes) + return md.text + + +def _format_sources(data: Any) -> str: + if not data or not isinstance(data, list): + return "n/a" + sources = [] + for source in data: + if not isinstance(source, dict): + source = {"name": source} + name = source.get("name") + if not name: + continue + url = source.get("url") + author = source.get("author") + result = name if not url else "[{}]({})".format(name, url) + if author: + result += " ({})".format(author) + sources.append(result) + return "
".join(sources) + + +def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str: + if not data: + return "" + md = MarkdownRenderer() + scalars = [(k, v) for k, v in data.items() if isinstance(v, (int, float))] + scores = [ + (md.code(acc.upper()), f"{score*100:.2f}") + for acc, score in scalars + if acc not in exclude + ] + md.add(md.table(scores, ["Type", "Score"])) + return md.text + + +def _format_label_scheme(data: Dict[str, Any]) -> str: + if not data: + return "" + md = MarkdownRenderer() + n_labels = 0 + n_pipes = 0 + label_data = [] + for pipe, labels in data.items(): + if not labels: + continue + col1 = md.bold(md.code(pipe)) + col2 = ", ".join( + [md.code(label.replace("|", "\|")) for label in labels] + ) # noqa: W605 + label_data.append((col1, col2)) + n_labels += len(labels) + n_pipes += 1 + if not label_data: + return "" + label_info = f"View label scheme ({n_labels} labels for {n_pipes} components)" + md.add("
") + md.add(f"{label_info}") + md.add(md.table(label_data, ["Component", "Labels"])) + md.add("
") + return md.text + + TEMPLATE_SETUP = """ #!/usr/bin/env python import io diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 685f998ff..5dfe567b3 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -933,7 +933,10 @@ copied into the package and imported in the `__init__.py`. If the path to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in the input directory, this file is used. Otherwise, the data can be entered directly from the command line. spaCy will then create a build artifact that you -can distribute and install with `pip install`. +can distribute and install with `pip install`. As of v3.1, the `package` command +will also create a formatted `README.md` based on the pipeline information +defined in the `meta.json`. If a `README.md` is already present in the source +directory, it will be used instead.