diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5702d0853..dbe9fa66e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -337,7 +337,8 @@ jobs:
       - run:
           name: benchmark
           command: |
-            python benchmark/benchmark.py /usr/local/bin/python3 build/benchmarks.json
+            python benchmark/benchmark.py all --output build/benchmarks.json
+
       - store_artifacts:
           path: /root/repo/build/benchmarks.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 697a710ee..f01da15e6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-exclude: (^.*patches|.*\.cgi$|^packages/micropip/src/micropip/externals|^src/py/lib/pystone.py$)
+exclude: (^.*patches|.*\.cgi$|^packages/micropip/src/micropip/externals|^benchmark/benchmarks$)
 default_language_version:
   python: "3.9"
 repos:
diff --git a/Makefile b/Makefile
index 739fd5ab5..60af672a9 100644
--- a/Makefile
+++ b/Makefile
@@ -149,7 +149,7 @@ lint:
 	pre-commit run -a --show-diff-on-failure

 benchmark: all
-	$(HOSTPYTHON) benchmark/benchmark.py $(HOSTPYTHON) build/benchmarks.json
+	$(HOSTPYTHON) benchmark/benchmark.py all --output build/benchmarks.json
 	$(HOSTPYTHON) benchmark/plot_benchmark.py build/benchmarks.json build/benchmarks.png

diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py
index 20e23ec7f..865026b63 100644
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import re
 import subprocess
@@ -5,12 +6,11 @@ import sys
 from pathlib import Path
 from time import time

-sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "test"))
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

 import conftest  # noqa: E402

-SKIP = {"fft", "hyantes", "README"}
+SKIP = {"fft", "hyantes"}


 def print_entry(name, res):
@@ -21,12 +21,13 @@
     print("")


-def run_native(hostpython, code):
+def run_native(code):
     if "# non-native" in code:
         return float("NaN")

+    root = Path(__file__).resolve().parents[1]
     output = subprocess.check_output(
-        [hostpython.resolve(), "-c", code],
+        [sys.executable, "-c", code],
         cwd=Path(__file__).resolve().parent,
         env={
             "PYTHONPATH": str(root / "src/py/lib")
@@ -45,8 +46,7 @@ def run_wasm(code, selenium, interrupt_buffer):
             pyodide.setInterruptBuffer(interrupt_buffer)
             """
         )
-    if "matplotlib" in code:
-        selenium.load_package("matplotlib")
+
     selenium.run(code)
     try:
         runtime = float(selenium.logs.split("\n")[-1])
@@ -56,12 +56,11 @@
     return runtime


-def run_all(hostpython, selenium_backends, code):
-    a = run_native(hostpython, code)
-    result = {"native": a}
+def run_all(selenium_backends, code):
+    result = {"native": run_native(code)}
+
     for browser_name, selenium in selenium_backends.items():
         for interrupt_buffer in [False, True]:
-            print(f"Running with: {browser_name} {interrupt_buffer}")
             dt = run_wasm(code, selenium, interrupt_buffer)
             if interrupt_buffer:
                 browser_name += "(w/ ib)"
@@ -69,11 +68,7 @@
     return result


-def get_pystone_benchmarks():
-    yield "pystone", ("import pystone\n" "pystone.main(pystone.LOOPS)\n")
-
-
-def parse_numpy_benchmark(filename):
+def parse_benchmark(filename):
     lines = []
     with open(filename) as fp:
         for line in fp:
@@ -84,102 +79,147 @@
     return "".join(lines)


-def get_numpy_benchmarks():
-    root = Path(__file__).resolve().parent / "benchmarks"
+def get_benchmark_scripts(scripts_dir, repeat=11, number=5):
+    root = Path(__file__).resolve().parent / scripts_dir
     for filename in sorted(root.iterdir()):
         name = filename.stem
+
         if name in SKIP:
             continue
-        if "canvas" not in str(filename) and "wasm" not in str(filename):
-            content = parse_numpy_benchmark(filename)
-            content += (
-                "import numpy as np\n"
-                "_ = np.empty(())\n"
-                "setup = setup + '\\nfrom __main__ import {}'\n"
-                "from timeit import Timer\n"
-                "t = Timer(run, setup)\n"
-                "r = t.repeat(11, 40)\n"
-                "r.remove(min(r))\n"
-                "r.remove(max(r))\n"
-                "print(np.mean(r))\n".format(name)
-            )
-            yield name, content
+
+        content = parse_benchmark(filename)
+        content += (
+            "import numpy as np\n"
+            "_ = np.empty(())\n"
+            f"setup = setup + '\\nfrom __main__ import {name}'\n"
+            "from timeit import Timer\n"
+            "t = Timer(run, setup)\n"
+            f"r = t.repeat({repeat}, {number})\n"
+            "r.remove(min(r))\n"
+            "r.remove(max(r))\n"
+            "print(np.mean(r))\n"
+        )
+
+        yield name, content
+
+
+def get_pystone_benchmarks():
+    return get_benchmark_scripts("benchmarks/pystone_benchmarks", repeat=5, number=1)
+
+
+def get_numpy_benchmarks():
+    return get_benchmark_scripts("benchmarks/numpy_benchmarks")


 def get_matplotlib_benchmarks():
-    root = Path(__file__).resolve().parent / "benchmarks"
-    for filename in sorted(root.iterdir()):
-        name = filename.stem
-        if name in SKIP:
-            continue
-        if "canvas" in str(filename) or "wasm" in str(filename):
-            content = parse_numpy_benchmark(filename)
-            content += (
-                "import numpy as np\n"
-                "_ = np.empty(())\n"
-                "setup = setup + '\\nfrom __main__ import {}'\n"
-                "from timeit import Timer\n"
-                "t = Timer(run, setup)\n"
-                "r = t.repeat(11, 20)\n"
-                "r.remove(min(r))\n"
-                "r.remove(max(r))\n"
-                "print(np.mean(r))\n".format(name)
-            )
-            yield name, content
+    return get_benchmark_scripts("benchmarks/matplotlib_benchmarks")


-def get_benchmarks():
-    yield from get_pystone_benchmarks()
-    yield from get_numpy_benchmarks()
-    yield from get_matplotlib_benchmarks()
+def get_benchmarks(benchmarks, targets=("all",)):
+    if "all" in targets:
+        for benchmark in benchmarks.values():
+            yield from benchmark()
+    else:
+        for target in targets:
+            yield from benchmarks[target]()


-def main(hostpython):
+def parse_args(benchmarks):
+    benchmarks.append("all")
+
+    parser = argparse.ArgumentParser("Run benchmarks on Pyodide's performance")
+    parser.add_argument(
+        "target",
+        choices=benchmarks,
+        nargs="+",
+        help="Benchmarks to run ('all' to run all benchmarks)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="build/benchmarks.json",
+        help="path to the json file where benchmark results will be saved",
+    )
+    parser.add_argument(
+        "--timeout",
+        default=1200,
+        type=int,
+        help="Browser timeout(sec) for each benchmark (default: %(default)s)",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+
+    BENCHMARKS = {
+        "pystone": get_pystone_benchmarks,
+        "numpy": get_numpy_benchmarks,
+        "matplotlib": get_matplotlib_benchmarks,
+    }
+
+    args = parse_args(list(BENCHMARKS.keys()))
+    targets = [t.lower() for t in args.target]
+    output = Path(args.output).resolve()
+    timeout = args.timeout
+
+    results = {}
+    selenium_backends = {}
+    browser_cls = [
+        ("firefox", conftest.FirefoxWrapper),
+        ("chrome", conftest.ChromeWrapper),
+    ]
+
     with conftest.spawn_web_server() as (hostname, port, log_path):
-        results = {}
-        selenium_backends = {}
-        b = {"native": float("NaN")}
-        browser_cls = [
-            ("firefox", conftest.FirefoxWrapper),
-            ("chrome", conftest.ChromeWrapper),
-        ]
-        for name, cls in browser_cls:
-            t0 = time()
-            selenium_backends[name] = cls(port, script_timeout=1200)
-            b[name] = time() - t0
-            # pre-load numpy for the selenium instance used in benchmarks
-            selenium_backends[name].load_package("numpy")
-        results["selenium init"] = b
-        print_entry("selenium init", b)
+        # selenium initialization time
+        result = {"native": float("NaN")}
+        for browser_name, cls in browser_cls:
+            try:
+                t0 = time()
+                selenium = cls(port, script_timeout=timeout)
+                result[browser_name] = time() - t0
+            finally:
+                selenium.driver.quit()

-        # load packages
+        results["selenium init"] = result
+        print_entry("selenium init", result)
+
+        # package loading time
         for package_name in ["numpy"]:
-            b = {"native": float("NaN")}
+            result = {"native": float("NaN")}
             for browser_name, cls in browser_cls:
-                selenium = cls(port, script_timeout=1200)
+                selenium = cls(port, script_timeout=timeout)
                 try:
                     t0 = time()
                     selenium.load_package(package_name)
-                    b[browser_name] = time() - t0
+                    result[browser_name] = time() - t0
                 finally:
                     selenium.driver.quit()
-            results["load " + package_name] = b
-            print_entry("load " + package_name, b)
-
-        for name, content in get_benchmarks():
-            for browser_name, cls in browser_cls:
-                selenium_backends[browser_name].driver.quit()
-                selenium_backends[browser_name] = cls(port, script_timeout=1200)
-                selenium_backends[browser_name].load_package("numpy")
-            results[name] = run_all(hostpython, selenium_backends, content)
-            print_entry(name, results[name])
-        for selenium in selenium_backends.values():
-            selenium.driver.quit()
-    return results
+            results[f"load {package_name}"] = result
+            print_entry(f"load {package_name}", result)
+
+        # run benchmarks
+        for benchmark_name, content in get_benchmarks(BENCHMARKS, targets):
+            try:
+                # instantiate browsers for each benchmark to prevent side effects
+                for browser_name, cls in browser_cls:
+                    selenium_backends[browser_name] = cls(port, script_timeout=timeout)
+                    # pre-load numpy and matplotlib for the selenium instance used in benchmarks
+                    selenium_backends[browser_name].load_package(
+                        ["numpy", "matplotlib"]
+                    )
+
+                results[benchmark_name] = run_all(selenium_backends, content)
+                print_entry(benchmark_name, results[benchmark_name])
+            finally:
+                for selenium in selenium_backends.values():
+                    selenium.driver.quit()
+
+    output.parent.mkdir(exist_ok=True, parents=True)
+    output.write_text(json.dumps(results))


 if __name__ == "__main__":
-    results = main(Path(sys.argv[-2]).resolve())
-    with open(sys.argv[-1], "w") as fp:
-        json.dump(results, fp)
+    main()
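For context, each script collected by `get_benchmark_scripts` above is a plain Python file that declares its timing harness in `# setup:` and `# run:` header comments, which `parse_benchmark` converts into the `setup` and `run` strings handed to `timeit.Timer`. The file stem has to match the benchmarked function, because the generated harness appends `from __main__ import <stem>` to the setup. A minimal sketch of such a script (the name `my_filter.py` and its body are illustrative, not part of this patch):

```python
# setup: import numpy as np ; a = np.random.rand(1000)
# run: my_filter(a)

import numpy as np


def my_filter(a):
    # The "# run:" statement above is what the generated timeit harness measures.
    return np.convolve(a, np.ones(3) / 3.0)
```

Scripts that only make sense inside the browser can carry a `# non-native` marker, which makes `run_native` report NaN instead of executing them.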
diff --git a/benchmark/benchmarks/canvas_custom_font.py b/benchmark/benchmarks/matplotlib_benchmarks/canvas_custom_font.py
similarity index 100%
rename from benchmark/benchmarks/canvas_custom_font.py
rename to benchmark/benchmarks/matplotlib_benchmarks/canvas_custom_font.py
diff --git a/benchmark/benchmarks/canvas_image.py b/benchmark/benchmarks/matplotlib_benchmarks/canvas_image.py
similarity index 100%
rename from benchmark/benchmarks/canvas_image.py
rename to benchmark/benchmarks/matplotlib_benchmarks/canvas_image.py
diff --git a/benchmark/benchmarks/canvas_image_affine.py b/benchmark/benchmarks/matplotlib_benchmarks/canvas_image_affine.py
similarity index 100%
rename from benchmark/benchmarks/canvas_image_affine.py
rename to benchmark/benchmarks/matplotlib_benchmarks/canvas_image_affine.py
diff --git a/benchmark/benchmarks/canvas_rendering.py b/benchmark/benchmarks/matplotlib_benchmarks/canvas_rendering.py
similarity index 100%
rename from benchmark/benchmarks/canvas_rendering.py
rename to benchmark/benchmarks/matplotlib_benchmarks/canvas_rendering.py
diff --git a/benchmark/benchmarks/canvas_text_rotated.py b/benchmark/benchmarks/matplotlib_benchmarks/canvas_text_rotated.py
similarity index 100%
rename from benchmark/benchmarks/canvas_text_rotated.py
rename to benchmark/benchmarks/matplotlib_benchmarks/canvas_text_rotated.py
diff --git a/benchmark/benchmarks/wasm_custom_font.py b/benchmark/benchmarks/matplotlib_benchmarks/wasm_custom_font.py
similarity index 100%
rename from benchmark/benchmarks/wasm_custom_font.py
rename to benchmark/benchmarks/matplotlib_benchmarks/wasm_custom_font.py
diff --git a/benchmark/benchmarks/wasm_image.py b/benchmark/benchmarks/matplotlib_benchmarks/wasm_image.py
similarity index 100%
rename from benchmark/benchmarks/wasm_image.py
rename to benchmark/benchmarks/matplotlib_benchmarks/wasm_image.py
diff --git a/benchmark/benchmarks/wasm_image_affine.py b/benchmark/benchmarks/matplotlib_benchmarks/wasm_image_affine.py
similarity index 100%
rename from benchmark/benchmarks/wasm_image_affine.py
rename to benchmark/benchmarks/matplotlib_benchmarks/wasm_image_affine.py
diff --git a/benchmark/benchmarks/wasm_rendering.py b/benchmark/benchmarks/matplotlib_benchmarks/wasm_rendering.py
similarity index 100%
rename from benchmark/benchmarks/wasm_rendering.py
rename to benchmark/benchmarks/matplotlib_benchmarks/wasm_rendering.py
diff --git a/benchmark/benchmarks/wasm_text_rotated.py b/benchmark/benchmarks/matplotlib_benchmarks/wasm_text_rotated.py
similarity index 100%
rename from benchmark/benchmarks/wasm_text_rotated.py
rename to benchmark/benchmarks/matplotlib_benchmarks/wasm_text_rotated.py
diff --git a/benchmark/benchmarks/allpairs_distances.py b/benchmark/benchmarks/numpy_benchmarks/allpairs_distances.py
similarity index 100%
rename from benchmark/benchmarks/allpairs_distances.py
rename to benchmark/benchmarks/numpy_benchmarks/allpairs_distances.py
diff --git a/benchmark/benchmarks/allpairs_distances_loops.py b/benchmark/benchmarks/numpy_benchmarks/allpairs_distances_loops.py
similarity index 100%
rename from benchmark/benchmarks/allpairs_distances_loops.py
rename to benchmark/benchmarks/numpy_benchmarks/allpairs_distances_loops.py
diff --git a/benchmark/benchmarks/arc_distance.py b/benchmark/benchmarks/numpy_benchmarks/arc_distance.py
similarity index 100%
rename from benchmark/benchmarks/arc_distance.py
rename to benchmark/benchmarks/numpy_benchmarks/arc_distance.py
diff --git a/benchmark/benchmarks/check_mask.py b/benchmark/benchmarks/numpy_benchmarks/check_mask.py
similarity index 100%
rename from benchmark/benchmarks/check_mask.py
rename to benchmark/benchmarks/numpy_benchmarks/check_mask.py
diff --git a/benchmark/benchmarks/create_grid.py b/benchmark/benchmarks/numpy_benchmarks/create_grid.py
similarity index 100%
rename from benchmark/benchmarks/create_grid.py
rename to benchmark/benchmarks/numpy_benchmarks/create_grid.py
diff --git a/benchmark/benchmarks/cronbach.py b/benchmark/benchmarks/numpy_benchmarks/cronbach.py
similarity index 100%
rename from benchmark/benchmarks/cronbach.py
rename to benchmark/benchmarks/numpy_benchmarks/cronbach.py
diff --git a/benchmark/benchmarks/diffusion.py b/benchmark/benchmarks/numpy_benchmarks/diffusion.py
similarity index 100%
rename from benchmark/benchmarks/diffusion.py
rename to benchmark/benchmarks/numpy_benchmarks/diffusion.py
diff --git a/benchmark/benchmarks/evolve.py b/benchmark/benchmarks/numpy_benchmarks/evolve.py
similarity index 100%
rename from benchmark/benchmarks/evolve.py
rename to benchmark/benchmarks/numpy_benchmarks/evolve.py
diff --git a/benchmark/benchmarks/fdtd.py b/benchmark/benchmarks/numpy_benchmarks/fdtd.py
similarity index 100%
rename from benchmark/benchmarks/fdtd.py
rename to benchmark/benchmarks/numpy_benchmarks/fdtd.py
diff --git a/benchmark/benchmarks/fft.py b/benchmark/benchmarks/numpy_benchmarks/fft.py
similarity index 100%
rename from benchmark/benchmarks/fft.py
rename to benchmark/benchmarks/numpy_benchmarks/fft.py
diff --git a/benchmark/benchmarks/grayscott.py b/benchmark/benchmarks/numpy_benchmarks/grayscott.py
similarity index 100%
rename from benchmark/benchmarks/grayscott.py
rename to benchmark/benchmarks/numpy_benchmarks/grayscott.py
diff --git a/benchmark/benchmarks/grouping.py b/benchmark/benchmarks/numpy_benchmarks/grouping.py
similarity index 100%
rename from benchmark/benchmarks/grouping.py
rename to benchmark/benchmarks/numpy_benchmarks/grouping.py
diff --git a/benchmark/benchmarks/growcut.py b/benchmark/benchmarks/numpy_benchmarks/growcut.py
similarity index 100%
rename from benchmark/benchmarks/growcut.py
rename to benchmark/benchmarks/numpy_benchmarks/growcut.py
diff --git a/benchmark/benchmarks/harris.py b/benchmark/benchmarks/numpy_benchmarks/harris.py
similarity index 100%
rename from benchmark/benchmarks/harris.py
rename to benchmark/benchmarks/numpy_benchmarks/harris.py
diff --git a/benchmark/benchmarks/hasting.py b/benchmark/benchmarks/numpy_benchmarks/hasting.py
similarity index 100%
rename from benchmark/benchmarks/hasting.py
rename to benchmark/benchmarks/numpy_benchmarks/hasting.py
diff --git a/benchmark/benchmarks/hyantes.py b/benchmark/benchmarks/numpy_benchmarks/hyantes.py
similarity index 100%
rename from benchmark/benchmarks/hyantes.py
rename to benchmark/benchmarks/numpy_benchmarks/hyantes.py
diff --git a/benchmark/benchmarks/julia.py b/benchmark/benchmarks/numpy_benchmarks/julia.py
similarity index 100%
rename from benchmark/benchmarks/julia.py
rename to benchmark/benchmarks/numpy_benchmarks/julia.py
diff --git a/benchmark/benchmarks/l2norm.py b/benchmark/benchmarks/numpy_benchmarks/l2norm.py
similarity index 100%
rename from benchmark/benchmarks/l2norm.py
rename to benchmark/benchmarks/numpy_benchmarks/l2norm.py
diff --git a/benchmark/benchmarks/large_decimal_list.py b/benchmark/benchmarks/numpy_benchmarks/large_decimal_list.py
similarity index 100%
rename from benchmark/benchmarks/large_decimal_list.py
rename to benchmark/benchmarks/numpy_benchmarks/large_decimal_list.py
diff --git a/benchmark/benchmarks/local_maxima.py b/benchmark/benchmarks/numpy_benchmarks/local_maxima.py
similarity index 100%
rename from benchmark/benchmarks/local_maxima.py
rename to benchmark/benchmarks/numpy_benchmarks/local_maxima.py
diff --git a/benchmark/benchmarks/log_likelihood.py b/benchmark/benchmarks/numpy_benchmarks/log_likelihood.py
similarity index 100%
rename from benchmark/benchmarks/log_likelihood.py
rename to benchmark/benchmarks/numpy_benchmarks/log_likelihood.py
diff --git a/benchmark/benchmarks/lstsqr.py b/benchmark/benchmarks/numpy_benchmarks/lstsqr.py
similarity index 100%
rename from benchmark/benchmarks/lstsqr.py
rename to benchmark/benchmarks/numpy_benchmarks/lstsqr.py
diff --git a/benchmark/benchmarks/mandel.py b/benchmark/benchmarks/numpy_benchmarks/mandel.py
similarity index 100%
rename from benchmark/benchmarks/mandel.py
rename to benchmark/benchmarks/numpy_benchmarks/mandel.py
diff --git a/benchmark/benchmarks/multiple_sum.py b/benchmark/benchmarks/numpy_benchmarks/multiple_sum.py
similarity index 100%
rename from benchmark/benchmarks/multiple_sum.py
rename to benchmark/benchmarks/numpy_benchmarks/multiple_sum.py
diff --git a/benchmark/benchmarks/pairwise_loop.py b/benchmark/benchmarks/numpy_benchmarks/pairwise_loop.py
similarity index 100%
rename from benchmark/benchmarks/pairwise_loop.py
rename to benchmark/benchmarks/numpy_benchmarks/pairwise_loop.py
diff --git a/benchmark/benchmarks/periodic_dist.py b/benchmark/benchmarks/numpy_benchmarks/periodic_dist.py
similarity index 100%
rename from benchmark/benchmarks/periodic_dist.py
rename to benchmark/benchmarks/numpy_benchmarks/periodic_dist.py
diff --git a/benchmark/benchmarks/repeating.py b/benchmark/benchmarks/numpy_benchmarks/repeating.py
similarity index 100%
rename from benchmark/benchmarks/repeating.py
rename to benchmark/benchmarks/numpy_benchmarks/repeating.py
diff --git a/benchmark/benchmarks/reverse_cumsum.py b/benchmark/benchmarks/numpy_benchmarks/reverse_cumsum.py
similarity index 100%
rename from benchmark/benchmarks/reverse_cumsum.py
rename to benchmark/benchmarks/numpy_benchmarks/reverse_cumsum.py
diff --git a/benchmark/benchmarks/rosen.py b/benchmark/benchmarks/numpy_benchmarks/rosen.py
similarity index 100%
rename from benchmark/benchmarks/rosen.py
rename to benchmark/benchmarks/numpy_benchmarks/rosen.py
diff --git a/benchmark/benchmarks/slowparts.py b/benchmark/benchmarks/numpy_benchmarks/slowparts.py
similarity index 100%
rename from benchmark/benchmarks/slowparts.py
rename to benchmark/benchmarks/numpy_benchmarks/slowparts.py
diff --git a/benchmark/benchmarks/smoothing.py b/benchmark/benchmarks/numpy_benchmarks/smoothing.py
similarity index 100%
rename from benchmark/benchmarks/smoothing.py
rename to benchmark/benchmarks/numpy_benchmarks/smoothing.py
diff --git a/benchmark/benchmarks/specialconvolve.py b/benchmark/benchmarks/numpy_benchmarks/specialconvolve.py
similarity index 100%
rename from benchmark/benchmarks/specialconvolve.py
rename to benchmark/benchmarks/numpy_benchmarks/specialconvolve.py
diff --git a/benchmark/benchmarks/vibr_energy.py b/benchmark/benchmarks/numpy_benchmarks/vibr_energy.py
similarity index 100%
rename from benchmark/benchmarks/vibr_energy.py
rename to benchmark/benchmarks/numpy_benchmarks/vibr_energy.py
diff --git a/benchmark/benchmarks/wave.py b/benchmark/benchmarks/numpy_benchmarks/wave.py
similarity index 100%
rename from benchmark/benchmarks/wave.py
rename to benchmark/benchmarks/numpy_benchmarks/wave.py
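The renames above sort every benchmark script into a per-suite directory, which is what the new `get_*_benchmarks` helpers point `get_benchmark_scripts` at. A quick sketch of how that layout is discovered (illustrative only; it mirrors the `iterdir()` loop in `get_benchmark_scripts` and assumes it is run from the repository root):

```python
from pathlib import Path

# One sub-directory per suite; each .py file is one benchmark, keyed by its stem.
root = Path("benchmark/benchmarks")
for suite in ("pystone", "numpy", "matplotlib"):
    scripts = sorted((root / f"{suite}_benchmarks").iterdir())
    print(suite, [script.stem for script in scripts])
```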
diff --git a/src/py/lib/pystone.py b/benchmark/benchmarks/pystone_benchmarks/pystone.py
old mode 100755
new mode 100644
similarity index 93%
rename from src/py/lib/pystone.py
rename to benchmark/benchmarks/pystone_benchmarks/pystone.py
index f30f5870c..5b125462d
--- a/src/py/lib/pystone.py
+++ b/benchmark/benchmarks/pystone_benchmarks/pystone.py
@@ -1,6 +1,9 @@
 #! /usr/bin/env python
 # flake8: noqa

+# setup: pass
+# run: pystone()
+
 """

 "PYSTONE" Benchmark Program
@@ -279,22 +282,5 @@ def Func3(EnumParIn):
     return FALSE


-if __name__ == "__main__":
-    import sys
-
-    def error(msg):
-        print(msg, end=" ", file=sys.stderr)
-        print("usage: %s [number_of_loops]" % sys.argv[0], file=sys.stderr)
-        sys.exit(100)
-
-    nargs = len(sys.argv) - 1
-    if nargs > 1:
-        error("%d arguments are too many;" % nargs)
-    elif nargs == 1:
-        try:
-            loops = int(sys.argv[1])
-        except ValueError:
-            error("Invalid argument %r;" % sys.argv[1])
-    else:
-        loops = LOOPS
-    main(loops)
+def pystone():
+    main(LOOPS)
diff --git a/docs/development/testing.md b/docs/development/testing.md
index dc9172b80..dfa1548d8 100644
--- a/docs/development/testing.md
+++ b/docs/development/testing.md
@@ -76,7 +76,7 @@ To run common benchmarks to understand Pyodide's performance, begin by
 installing the same prerequisites as for testing. Then run:

 ```bash
-make benchmark
+PYODIDE_PACKAGES="numpy,matplotlib" make benchmark
 ```

 ## Linting
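With the argparse interface introduced above, the script can also be invoked directly for a subset of suites rather than through `make`. A hypothetical invocation, using only the options the new parser defines (`target`, `-o/--output`, `--timeout`):

```bash
# Run just the numpy and matplotlib suites with a 10-minute browser timeout.
python benchmark/benchmark.py numpy matplotlib --timeout 600 -o build/benchmarks.json

# Run every suite, as the CI job and `make benchmark` now do.
python benchmark/benchmark.py all --output build/benchmarks.json
```

Note that the browser side pre-loads numpy and matplotlib for each benchmark, so both packages must be present in the Pyodide build, hence the `PYODIDE_PACKAGES="numpy,matplotlib"` hint added to the docs.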