From 319eb508b59cd1b51f354a25fb0e88457424a259 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 12 Jan 2023 11:55:21 +0100
Subject: [PATCH] Add a `spacy benchmark speed` subcommand (#11902)

* Add a `spacy evaluate speed` subcommand

This subcommand reports the mean batch performance of a model on a data set with
a 95% confidence interval. For reliability, it first performs some warmup
rounds. Then it will measure performance on batches with randomly shuffled
documents.

To avoid having too many spaCy commands, `speed` is a subcommand of `evaluate`
and accuracy evaluation is moved to its own `evaluate accuracy` subcommand.

* Fix import cycle

* Restore `spacy evaluate`, make `spacy benchmark speed` an alias

* Add documentation for `spacy benchmark`

* CREATES -> PRINTS

* WPS -> words/s

* Disable formatting of benchmark speed arguments

* Fail with an error message when trying to speed bench empty corpus

* Make it clearer that `benchmark accuracy` is a replacement for `evaluate`

* Fix docstring webpage reference

* tests: check `evaluate` output against `benchmark accuracy`
---
 spacy/cli/__init__.py        |   1 +
 spacy/cli/_util.py           |   3 +
 spacy/cli/benchmark_speed.py | 174 +++++++++++++++++++++++++++++++++++
 spacy/cli/evaluate.py        |   7 +-
 spacy/tests/test_cli_app.py  |   9 ++
 website/docs/api/cli.mdx     |  44 ++++++++-
 6 files changed, 231 insertions(+), 7 deletions(-)
 create mode 100644 spacy/cli/benchmark_speed.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index aabd1cfef..868526b42 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -4,6 +4,7 @@ from ._util import app, setup_cli  # noqa: F401
 
 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
+from .benchmark_speed import benchmark_speed_cli  # noqa: F401
 from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 0f4e9f599..ba3892b1d 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
 commands to check and validate your config files, training and evaluation data,
 and custom model implementations.
 """
+BENCHMARK_HELP = """Commands for benchmarking pipelines."""
 INIT_HELP = """Commands for initializing configs and pipeline packages."""
 
 # Wrappers for Typer's annotations. Initially created to set defaults and to
@@ -54,12 +55,14 @@ Arg = typer.Argument
 Opt = typer.Option
 
 app = typer.Typer(name=NAME, help=HELP)
+benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
 project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
 
 app.add_typer(project_cli)
 app.add_typer(debug_cli)
+app.add_typer(benchmark_cli)
 app.add_typer(init_cli)
 
 
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
new file mode 100644
index 000000000..4eb20a5fa
--- /dev/null
+++ b/spacy/cli/benchmark_speed.py
@@ -0,0 +1,174 @@
+from typing import Iterable, List, Optional
+import random
+from itertools import islice
+import numpy
+from pathlib import Path
+import time
+from tqdm import tqdm
+import typer
+from wasabi import msg
+
+from .. import util
+from ..language import Language
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, benchmark_cli, setup_gpu
+
+
+@benchmark_cli.command(
+    "speed",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def benchmark_speed_cli(
+    # fmt: off
+    ctx: typer.Context,
+    model: str = Arg(..., help="Model name or path"),
+    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
+    batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
+    no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
+    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    # fmt: on
+):
+    """
+    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
+    data in the binary .spacy format.
+    """
+    setup_gpu(use_gpu=use_gpu, silent=False)
+
+    nlp = util.load_model(model)
+    batch_size = batch_size if batch_size is not None else nlp.batch_size
+    corpus = Corpus(data_path)
+    docs = [eg.predicted for eg in corpus(nlp)]
+
+    if len(docs) == 0:
+        msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
+
+    print(f"Warming up for {warmup_epochs} epochs...")
+    warmup(nlp, docs, warmup_epochs, batch_size)
+
+    print()
+    print(f"Benchmarking {n_batches} batches...")
+    wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
+
+    print()
+    print_outliers(wps)
+    print_mean_with_ci(wps)
+
+
+# Lowercased, behaves as a context manager function.
+class time_context:
+    """Register the running time of a context."""
+
+    def __enter__(self):
+        self.start = time.perf_counter()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.elapsed = time.perf_counter() - self.start
+
+
+class Quartiles:
+    """Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
+    of a sample."""
+
+    q1: float
+    q2: float
+    q3: float
+    iqr: float
+
+    def __init__(self, sample: numpy.ndarray) -> None:
+        self.q1 = numpy.quantile(sample, 0.25)
+        self.q2 = numpy.quantile(sample, 0.5)
+        self.q3 = numpy.quantile(sample, 0.75)
+        self.iqr = self.q3 - self.q1
+
+
+def annotate(
+    nlp: Language, docs: List[Doc], batch_size: Optional[int]
+) -> numpy.ndarray:
+    docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
+    wps = []
+    while True:
+        with time_context() as elapsed:
+            batch_docs = list(
+                islice(docs, batch_size if batch_size else nlp.batch_size)
+            )
+        if len(batch_docs) == 0:
+            break
+        n_tokens = count_tokens(batch_docs)
+        wps.append(n_tokens / elapsed.elapsed)
+
+    return numpy.array(wps)
+
+
+def benchmark(
+    nlp: Language,
+    docs: List[Doc],
+    n_batches: int,
+    batch_size: int,
+    shuffle: bool,
+) -> numpy.ndarray:
+    if shuffle:
+        bench_docs = [
+            nlp.make_doc(random.choice(docs).text)
+            for _ in range(n_batches * batch_size)
+        ]
+    else:
+        bench_docs = [
+            nlp.make_doc(docs[i % len(docs)].text)
+            for i in range(n_batches * batch_size)
+        ]
+
+    return annotate(nlp, bench_docs, batch_size)
+
+
+def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
+    """Apply a statistic to repeated random samples of an array."""
+    return numpy.fromiter(
+        (
+            statistic(numpy.random.choice(x, len(x), replace=True))
+            for _ in range(iterations)
+        ),
+        numpy.float64,
+    )
+
+
+def count_tokens(docs: Iterable[Doc]) -> int:
+    return sum(len(doc) for doc in docs)
+
+
+def print_mean_with_ci(sample: numpy.ndarray):
+    mean = numpy.mean(sample)
+    bootstrap_means = bootstrap(sample)
+    bootstrap_means.sort()
+
+    # 95% confidence interval
+    low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
+    high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
+
+    print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
+
+
+def print_outliers(sample: numpy.ndarray):
+    quartiles = Quartiles(sample)
+
+    n_outliers = numpy.sum(
+        (sample < (quartiles.q1 - 1.5 * quartiles.iqr))
+        | (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
+    )
+    n_extreme_outliers = numpy.sum(
+        (sample < (quartiles.q1 - 3.0 * quartiles.iqr))
+        | (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
+    )
+    print(
+        f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
+    )
+
+
+def warmup(
+    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
+) -> numpy.ndarray:
+    docs = warmup_epochs * docs
+    return annotate(nlp, docs, batch_size)
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 0d08d2c5e..8f3d6b859 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
 
 from ..training import Corpus
 from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code
+from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
 from ..scorer import Scorer
 from .. import util
 from .. import displacy
 
 
+@benchmark_cli.command(
+    "accuracy",
+)
 @app.command("evaluate")
 def evaluate_cli(
     # fmt: off
@@ -36,7 +39,7 @@ def evaluate_cli(
     dependency parses in a HTML file, set as output directory as the
     displacy_path argument.
 
-    DOCS: https://spacy.io/api/cli#evaluate
+    DOCS: https://spacy.io/api/cli#benchmark-accuracy
     """
     import_code(code_path)
     evaluate(
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 873a3ff66..84b2b8d4d 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -31,3 +31,12 @@ def test_convert_auto_conflict():
         assert "All input files must be same type" in result.stdout
         out_files = os.listdir(d_out)
         assert len(out_files) == 0
+
+
+def test_benchmark_accuracy_alias():
+    # Verify that the `evaluate` alias works correctly.
+    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
+    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
+    assert result_benchmark.stdout == result_evaluate.stdout.replace(
+        "spacy evaluate", "spacy benchmark accuracy"
+    )
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 8b84a02ff..5f10c8bc0 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -12,6 +12,7 @@ menu:
   - ['train', 'train']
   - ['pretrain', 'pretrain']
   - ['evaluate', 'evaluate']
+  - ['benchmark', 'benchmark']
   - ['apply', 'apply']
   - ['find-threshold', 'find-threshold']
   - ['assemble', 'assemble']
@@ -1135,8 +1136,19 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
 
 ## evaluate {id="evaluate",version="2",tag="command"}
 
-Evaluate a trained pipeline. Expects a loadable spaCy pipeline (package name or
-path) and evaluation data in the
+The `evaluate` subcommand is superseded by
+[`spacy benchmark accuracy`](#benchmark-accuracy). `evaluate` is provided as an
+alias to `benchmark accuracy` for compatibility.
+
+## benchmark {id="benchmark", version="3.5"}
+
+The `spacy benchmark` CLI includes commands for benchmarking the accuracy and
+speed of your spaCy pipelines.
+
+### accuracy {id="benchmark-accuracy", version="3.5", tag="command"}
+
+Evaluate the accuracy of a trained pipeline. Expects a loadable spaCy pipeline
+(package name or path) and evaluation data in the
 [binary `.spacy` format](/api/data-formats#binary-training). The
 `--gold-preproc` option sets up the evaluation examples with gold-standard
 sentences and tokens for the predictions. Gold preprocessing helps the
@@ -1147,7 +1159,7 @@ skew. To render a sample of dependency parses in a HTML file using the
 `--displacy-path` argument.
 
 ```bash
-$ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
+$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
 ```
 
 | Name                                      | Description                                                                                                                                                                          |
@@ -1163,6 +1175,29 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
 | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
 | **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            |
 
+### speed {id="benchmark-speed", version="3.5", tag="command"}
+
+Benchmark the speed of a trained pipeline with a 95% confidence interval.
+Expects a loadable spaCy pipeline (package name or path) and benchmark data in
+the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
+warmed up before any measurements are taken.
+
+```cli
+$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
+```
+
+| Name                 | Description                                                                                              |
+| -------------------- | -------------------------------------------------------------------------------------------------------- |
+| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
+| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~           |
+| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                          |
+| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                           |
+| `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                         |
+| `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~               |
+| `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                               |
+| **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                       |
+
 ## apply {id="apply", version="3.5", tag="command"}
 
 Applies a trained pipeline to data and stores the resulting annotated documents
@@ -1176,7 +1211,7 @@ input formats are:
 
 When a directory is provided it is traversed recursively to collect all files.
 
-```cli
+```bash
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
 
@@ -1194,7 +1229,6 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
 | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
 | **CREATES**                               | A `DocBin` with the annotations from the `model` for all the files found in `data-path`.                                                                                             |
 
-
 ## find-threshold {id="find-threshold",version="3.5",tag="command"}
 
 Runs prediction trials for a trained model with varying tresholds to maximize