"""Orchestrate the full Lagoon benchmark suite and produce a Markdown summary.

Runs, in order:
  1. bench_ingest.py   (keeps the namespace for later phases)
  2. bench_recall.py   (ANN vs exact recall@k)
  3. bench_latency.py  (warm phase)
  4. bench_cache.py    (hit rates with a skewed hot set)
  5. Renders results/SUMMARY.md from the per-benchmark JSON files.

The cold-latency phase cannot be fully automated from inside this process
because a *true* cold measurement requires restarting the API server (or
clearing its local cache directory) between samples. This script prints exact
instructions for collecting cold samples and merges any
results/latency_cold.json it finds into the summary.

Usage:
    python run_all.py --docs 50000 --dim 256
"""

from __future__ import annotations

import argparse
import json
import os
import subprocess
import sys
from typing import Optional

HERE = os.path.dirname(os.path.abspath(__file__))

# Namespace used when the caller does not specify one (matches the CLI default).
_DEFAULT_NAMESPACE = "bench-ingest"

# Warm-phase workloads, in the order they are listed in the summary.
_WARM_WORKLOADS = (
    "vector_ann",
    "vector_exact",
    "bm25",
    "hybrid_rrf",
    "filtered_vector",
)


def run(script: str, *args: str) -> None:
    """Run a sibling benchmark script with the current interpreter.

    ``script`` is resolved relative to this file's directory and ``args`` are
    passed through unchanged. Raises ``subprocess.CalledProcessError`` when
    the script exits with a non-zero status.
    """
    cmd = [sys.executable, os.path.join(HERE, script), *args]
    # Flush so the echoed command appears *before* the child's own output even
    # when stdout is block-buffered (piped to a file or a CI log).
    print(f"\n$ {' '.join(cmd)}", flush=True)
    subprocess.run(cmd, check=True)


def load(out_dir: str, name: str) -> Optional[dict]:
    """Return the parsed ``<out_dir>/<name>.json``, or ``None`` if it is absent.

    A file that exists but does not contain valid JSON still raises, so a
    corrupt result is never silently dropped from the summary.
    """
    path = os.path.join(out_dir, f"{name}.json")
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return None


def fmt_latency(block: Optional[dict]) -> str:
    """Format a latency summary as ``p50 … · p90 … · p99 … (n=…)``.

    ``block`` is either the stats dict itself or a dict holding it under
    ``"client_latency"``. Returns an em dash when there is nothing to report
    (no block, no stats, or a sample count of zero).
    """
    if not block:
        return "—"
    s = block.get("client_latency", block)
    if not s or s.get("count", 0) == 0:
        return "—"
    return (f"p50 {s['p50_ms']} ms · p90 {s['p90_ms']} ms · "
            f"p99 {s['p99_ms']} ms (n={s['count']})")


def _cold_command(out_dir: str, namespace: str, dim: object) -> str:
    """Build the shell command that collects one cold-latency sample."""
    return (f"python bench_latency.py --phase cold --namespace {namespace} "
            f"--dim {dim} --out-dir {out_dir}")


def _header_section() -> list[str]:
    """Title and the pointer to the methodology guide."""
    return [
        "# Lagoon benchmark results",
        "",
        "> Generated by `benchmarks/run_all.py`. Read",
        "> `docs/benchmark-guide.md` for methodology and reporting policy",
        "> before quoting any number from this file.",
        "",
    ]


def _ingest_sections(ingest: dict) -> list[str]:
    """Environment details and ingest throughput, both read from ingest.json."""
    env = ingest.get("environment", {})
    r, p = ingest["results"], ingest["params"]
    return [
        "## Environment",
        "",
        f"- Timestamp (UTC): {env.get('timestamp_utc', '?')}",
        f"- Host: {env.get('hostname', '?')} — {env.get('platform', '?')}",
        f"- CPUs: {env.get('cpu_count', '?')}",
        f"- Server URL: {env.get('lagoon_url', '?')}",
        "- Storage backend: **FILL IN** (filesystem / MinIO / S3 + region)",
        "- Server build: **FILL IN** (git SHA, release/debug)",
        "",
        "## Ingest throughput",
        "",
        f"- Dataset: {p['docs']:,} docs, {p['dim']}-d vectors, "
        f"batch size {p['batch_size']}, concurrency {p['concurrency']}",
        f"- Throughput: **{r['docs_per_second']:,} docs/s** "
        f"({r['payload_mb_per_second']} MB/s JSON payload)",
        f"- Per-batch latency: {fmt_latency({'client_latency': r['batch_latency']})}",
        f"- Index catch-up after last write: {r['index_catchup_seconds']} s "
        f"(caught up: {r['index_caught_up']})",
        "",
    ]


def _recall_section(recall: dict) -> list[str]:
    """ANN-vs-exact recall figures and the latency of both search modes."""
    r, p = recall["results"], recall["params"]
    rec_items = [f"{k} = **{v}**" for k, v in r.items() if k.startswith("recall@")]
    # Only attach the multiplication sign to a real number; "—×" reads as noise.
    speedup = f"{r['speedup_mean']}×" if "speedup_mean" in r else "—"
    return [
        "## ANN recall vs exact kNN",
        "",
        f"- {p['queries']} queries, top_k={p['top_k']}, metric={p['metric']}",
        "- " + " · ".join(rec_items),
        f"- Exact latency: {fmt_latency({'client_latency': r['exact_latency']})}",
        f"- ANN latency: {fmt_latency({'client_latency': r['ann_latency']})}",
        f"- Mean speedup (exact/ANN): {speedup}",
        "",
    ]


def _latency_section(cold: Optional[dict], warm: Optional[dict],
                     cold_cmd: str) -> list[str]:
    """Cold sample (or how to collect one) followed by the warm workloads."""
    lines = ["## Query latency", ""]
    if cold:
        r = cold["results"]
        lines.append(
            f"- **Cold** (first query after restart/cache clear): "
            f"{r.get('first_query_cold_ms', '—')} ms "
            f"(single sample — collect more by re-running the cold phase)"
        )
    else:
        lines.append(
            "- **Cold**: not collected. Restart the server (or clear its cache "
            f"dir), then run `{cold_cmd}` and regenerate this summary."
        )
    if warm:
        results = warm["results"]
        lines += [
            f"- **Warm {wl}**: {fmt_latency(results[wl])}"
            for wl in _WARM_WORKLOADS
            if wl in results
        ]
    lines.append("")
    return lines


def _cache_section(cache: dict) -> list[str]:
    """Cache hit/miss counters plus any warning the cache benchmark recorded."""
    r = cache["results"]
    lines = [
        "## Cache hit rates",
        "",
        f"- Workload: {r['queries_issued']} queries, "
        f"{r['distinct_queries']} distinct",
        f"- Overall hit rate: **{r.get('overall_hit_rate', '—')}**",
        f"- Hits by tier: `{json.dumps(r.get('cache_hits_by_tier', {}))}`",
        f"- Misses by tier: `{json.dumps(r.get('cache_misses_by_tier', {}))}`",
        f"- Object-store GETs during workload: {r.get('object_store_gets', '—')}; "
        f"PUTs: {r.get('object_store_puts', '—')}",
        "",
    ]
    if "warning" in r:
        lines += [f"> ⚠️ {r['warning']}", ""]
    return lines


def _policy_section() -> list[str]:
    """Fixed reminder about how the numbers may (not) be quoted."""
    return [
        "## Reporting policy",
        "",
        "These numbers describe **this build on this machine with this "
        "dataset** only. Do not extrapolate to other hardware, datasets, or "
        "products, and do not present them as comparisons with proprietary "
        "systems that were not measured under identical conditions. See "
        "`docs/benchmark-guide.md` § Honest reporting.",
        "",
    ]


def render_summary(out_dir: str, namespace: str = _DEFAULT_NAMESPACE) -> str:
    """Render the Markdown summary from the JSON result files in ``out_dir``.

    Sections whose result file is missing are omitted, except "Query latency",
    which is always present and explains how to collect a cold sample when
    ``latency_cold.json`` is absent.

    ``namespace`` is only used in that cold-sample hint. The vector dimension
    shown in the hint is taken from ``ingest.json`` when available; otherwise
    a ``<DIM>`` placeholder is printed rather than an empty value.
    """
    ingest = load(out_dir, "ingest")
    recall = load(out_dir, "recall")
    warm = load(out_dir, "latency_warm")
    cold = load(out_dir, "latency_cold")
    cache = load(out_dir, "cache")

    # The ingest run is the ground truth for the dataset's dimension; it can
    # differ from the CLI default when only the summary is being re-rendered.
    dim = ingest.get("params", {}).get("dim", "<DIM>") if ingest else "<DIM>"
    cold_cmd = _cold_command(out_dir, namespace, dim)

    lines = _header_section()
    if ingest:
        lines += _ingest_sections(ingest)
    if recall:
        lines += _recall_section(recall)
    lines += _latency_section(cold, warm, cold_cmd)
    if cache:
        lines += _cache_section(cache)
    lines += _policy_section()
    return "\n".join(lines)


def main() -> None:
    """Parse CLI options, run the benchmark phases, and write SUMMARY.md.

    With ``--summary-only`` no benchmark is executed; the summary is simply
    re-rendered from whatever JSON results already exist in ``--out-dir``.
    """
    ap = argparse.ArgumentParser(description="Run the full Lagoon benchmark suite")
    ap.add_argument("--docs", type=int, default=50_000)
    ap.add_argument("--dim", type=int, default=256)
    ap.add_argument("--batch-size", type=int, default=500)
    ap.add_argument("--queries", type=int, default=200)
    ap.add_argument("--recall-queries", type=int, default=100)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--namespace", default=_DEFAULT_NAMESPACE)
    ap.add_argument("--out-dir", default="results")
    ap.add_argument("--summary-only", action="store_true",
                    help="Skip running benchmarks; just re-render SUMMARY.md "
                         "from existing JSON results.")
    args = ap.parse_args()

    if not args.summary_only:
        # Ingest keeps its namespace so the later phases query the same data.
        run("bench_ingest.py",
            "--docs", str(args.docs),
            "--dim", str(args.dim),
            "--batch-size", str(args.batch_size),
            "--seed", str(args.seed),
            "--namespace", args.namespace,
            "--keep-namespace",
            "--out-dir", args.out_dir)
        run("bench_recall.py",
            "--namespace", args.namespace,
            "--dim", str(args.dim),
            "--queries", str(args.recall_queries),
            "--top-k", "100",
            "--seed", str(args.seed),
            "--out-dir", args.out_dir)
        run("bench_latency.py",
            "--namespace", args.namespace,
            "--dim", str(args.dim),
            "--queries", str(args.queries),
            "--phase", "warm",
            "--seed", str(args.seed),
            "--out-dir", args.out_dir)
        run("bench_cache.py",
            "--namespace", args.namespace,
            "--dim", str(args.dim),
            "--queries", str(args.queries),
            "--hot-set", "20",
            "--seed", str(args.seed),
            "--out-dir", args.out_dir)

    summary = render_summary(args.out_dir, namespace=args.namespace)
    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, "SUMMARY.md")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(summary)
    print(f"\nWrote {out_path}")

    cold_cmd = _cold_command(args.out_dir, args.namespace, args.dim)
    # Step 3 repeats --namespace/--out-dir so the re-render reads the same
    # directory the cold sample was written to.
    print(
        "\nTo add COLD latency samples:\n"
        " 1. Restart the Lagoon server (or delete its local cache directory).\n"
        f" 2. {cold_cmd}\n"
        f" 3. python run_all.py --summary-only --namespace {args.namespace} "
        f"--out-dir {args.out_dir}\n"
        f"\nWhen finished, free storage with: "
        f"lagoon namespace delete {args.namespace} (or via the API)."
    )


if __name__ == "__main__":
    main()