Merge pull request #8465 from explosion/feature/spacy-package-readme

2021-06-24 13:11:08 +10:00 · 2021-06-24 13:11:08 +10:00 · a8e8d02ba7
parent 3e3d87a068 40f13c3f0c
commit a8e8d02ba7
2 changed files with 117 additions and 2 deletions
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,7 +1,7 @@
 from typing import Optional, Union, Any, Dict, List, Tuple
 import shutil
 from pathlib import Path
-from wasabi import Printer, get_raw_input
+from wasabi import Printer, MarkdownRenderer, get_raw_input
 import srsly
 import sys

@ -134,6 +134,11 @@ def package(
        file_path = package_path / model_name_v / file_name
        if file_path.exists():
            shutil.copy(str(file_path), str(main_path))
+    readme_path = main_path / "README.md"
+    if not readme_path.exists():
+        readme = generate_readme(meta)
+        create_file(readme_path, readme)
+        create_file(package_path / model_name_v / "README.md", readme)
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
@ -234,6 +239,113 @@ def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]
    return meta


+def generate_readme(meta: Dict[str, Any]) -> str:
+    """
+    Generate a Markdown-formatted README text from a model meta.json. Used
+    within the GitHub release notes and as content for README.md file added
+    to model packages.
+
+    The text consists of, in order: the optional description, a feature
+    table, the optional "Label Scheme" and "Accuracy" sections and the
+    optional notes.
+
+    meta (Dict[str, Any]): The pipeline meta. The keys "lang", "name",
+        "version" and "spacy_version" are required (a KeyError is raised if
+        one is missing); all other keys are optional.
+    RETURNS (str): The rendered Markdown.
+    """
+    md = MarkdownRenderer()
+    lang = meta["lang"]
+    name = f"{lang}_{meta['name']}"
+    version = meta["version"]
+    # The `or` fallbacks also cover keys that are present but set to null.
+    pipeline = ", ".join([md.code(p) for p in meta.get("pipeline") or []])
+    components = ", ".join([md.code(p) for p in meta.get("components") or []])
+    vecs = meta.get("vectors") or {}
+    vectors = f"{vecs.get('keys', 0)} keys, {vecs.get('vectors', 0)} unique vectors ({vecs.get('width', 0)} dimensions)"
+    author = meta.get("author") or "n/a"
+    # Only link the author if there is an actual URL: a present-but-empty
+    # "url" entry would otherwise render as a link with an empty target.
+    url = meta.get("url")
+    notes = meta.get("notes", "")
+    license_name = meta.get("license")
+    sources = _format_sources(meta.get("sources"))
+    description = meta.get("description")
+    label_scheme = _format_label_scheme(meta.get("labels"))
+    accuracy = _format_accuracy(meta.get("performance"))
+    table_data = [
+        (md.bold("Name"), md.code(name)),
+        (md.bold("Version"), md.code(version)),
+        (md.bold("spaCy"), md.code(meta["spacy_version"])),
+        (md.bold("Default Pipeline"), pipeline),
+        (md.bold("Components"), components),
+        (md.bold("Vectors"), vectors),
+        # _format_sources returns an empty string if no source has a name.
+        (md.bold("Sources"), sources or "n/a"),
+        (md.bold("License"), md.code(license_name) if license_name else "n/a"),
+        (md.bold("Author"), md.link(author, url) if url else author),
+    ]
+    # Put together Markdown body
+    if description:
+        md.add(description)
+    md.add(md.table(table_data, ["Feature", "Description"]))
+    if label_scheme:
+        md.add(md.title(3, "Label Scheme"))
+        md.add(label_scheme)
+    if accuracy:
+        md.add(md.title(3, "Accuracy"))
+        md.add(accuracy)
+    if notes:
+        md.add(notes)
+    return md.text
+
+
+def _format_sources(data: Any) -> str:
+    """
+    Render the "sources" entry of the meta as the content of one table cell.
+
+    data (Any): The raw "sources" value. Anything other than a non-empty
+        list yields "n/a". List items are either dicts with a "name" and
+        optional "url" and "author", or bare values used as the name.
+    RETURNS (str): One entry per named source, joined with "<br />". Each
+        entry is the name (as a Markdown link if a URL is given), followed
+        by the author in parentheses if one is given.
+    """
+    if not (data and isinstance(data, list)):
+        return "n/a"
+    rendered = []
+    for entry in data:
+        # Bare (non-dict) entries are treated as the name of the source.
+        info = entry if isinstance(entry, dict) else {"name": entry}
+        name = info.get("name")
+        if name:
+            url = info.get("url")
+            author = info.get("author")
+            cell = "[{}]({})".format(name, url) if url else name
+            if author:
+                cell += " ({})".format(author)
+            rendered.append(cell)
+    return "<br />".join(rendered)
+
+
+def _format_accuracy(
+    data: Optional[Dict[str, Any]], exclude: Optional[List[str]] = None
+) -> str:
+    """
+    Render the numeric scores of the meta "performance" entry as a Markdown
+    table with the columns "Type" and "Score".
+
+    data (Optional[Dict[str, Any]]): The raw "performance" value. Only
+        int/float values are used; other values (e.g. nested dicts) are
+        skipped.
+    exclude (Optional[List[str]]): Score names to leave out. Defaults to
+        ["speed"] if not set.
+    RETURNS (str): The rendered table, or an empty string if there is no
+        score to show. Scores are multiplied by 100 and shown with two
+        decimal places.
+    """
+    if not data:
+        return ""
+    # Resolve the default here rather than using a mutable list as the
+    # default argument, which would be shared between calls.
+    excluded = ["speed"] if exclude is None else exclude
+    md = MarkdownRenderer()
+    scores = [
+        (md.code(key.upper()), f"{value*100:.2f}")
+        for key, value in data.items()
+        if isinstance(value, (int, float)) and key not in excluded
+    ]
+    # Return nothing instead of a header-only table, so the caller doesn't
+    # add an empty "Accuracy" section.
+    if not scores:
+        return ""
+    md.add(md.table(scores, ["Type", "Score"]))
+    return md.text
+
+
+def _format_label_scheme(data: Optional[Dict[str, Any]]) -> str:
+    """
+    Render the meta "labels" entry as a collapsible Markdown section
+    containing a table with one row per component.
+
+    data (Optional[Dict[str, Any]]): The raw "labels" value, mapping
+        component names to their labels. Components without labels are
+        skipped.
+    RETURNS (str): The rendered <details> block, or an empty string if no
+        component has any labels.
+    """
+    if not data:
+        return ""
+    md = MarkdownRenderer()
+    n_labels = 0
+    n_pipes = 0
+    label_data = []
+    for pipe, labels in data.items():
+        if not labels:
+            continue
+        col1 = md.bold(md.code(pipe))
+        # Escape pipe characters, which would otherwise be read as column
+        # separators of the Markdown table. The backslash is written as an
+        # explicit "\\" because "\|" is an invalid escape sequence.
+        col2 = ", ".join([md.code(label.replace("|", "\\|")) for label in labels])
+        label_data.append((col1, col2))
+        n_labels += len(labels)
+        n_pipes += 1
+    if not label_data:
+        return ""
+    label_info = f"View label scheme ({n_labels} labels for {n_pipes} components)"
+    md.add("<details>")
+    md.add(f"<summary>{label_info}</summary>")
+    md.add(md.table(label_data, ["Component", "Labels"]))
+    md.add("</details>")
+    return md.text
+
+
 TEMPLATE_SETUP = """
 #!/usr/bin/env python
 import io
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -933,7 +933,10 @@ copied into the package and imported in the `__init__.py`. If the path to a
 [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
 the input directory, this file is used. Otherwise, the data can be entered
 directly from the command line. spaCy will then create a build artifact that you
-can distribute and install with `pip install`.
+can distribute and install with `pip install`. As of v3.1, the `package` command
+will also create a formatted `README.md` based on the pipeline information
+defined in the `meta.json`. If a `README.md` is already present in the source
+directory, it will be used instead.

 <Infobox title="New in v3.0" variant="warning">