"""Deterministic synthetic dataset generator for the Lagoon benchmark suite. Generates documents with: * dense vectors drawn from a Gaussian mixture (clustered, so IVF-style ANN indexes behave realistically — uniform random vectors make ANN look artificially easy or hard depending on dimension); * text fields built from a fixed vocabulary with a Zipf-like frequency distribution (so BM25 has realistic term statistics); * filterable attributes (category, year, price, in_stock). Everything is seeded, so two runs with the same parameters produce identical datasets — required for comparing results across machines or versions. Usage: python datagen.py --docs 50000 --dim 256 --out data/bench-50k.jsonl """ from __future__ import annotations import argparse import json import os from typing import Iterator, List, Tuple import numpy as np # A fixed 256-word vocabulary. Words are ordinary English nouns/adjectives; # the generator samples them with Zipf weights so a few words are very common # and the tail is rare, mimicking natural-language term distribution. 
# NOTE: list order is significant. zipf_weights() assigns rank 1 to index 0,
# and generate_text() indexes this list directly, so reordering, adding or
# removing entries changes every generated dataset.
VOCAB: List[str] = [
    "harbor", "lantern", "granite", "meadow", "copper",
    "willow", "ember", "compass", "orchard", "thicket",
    "ledger", "anchor", "breeze", "canyon", "drift",
    "estuary", "fjord", "glacier", "hollow", "inlet",
    "juniper", "kelp", "lagoon", "marsh", "nectar",
    "oasis", "pebble", "quarry", "ridge", "summit",
    "tundra", "valley", "wharf", "yarrow", "zephyr",
    "basalt", "cedar", "delta", "ebb", "flint",
    "grove", "heath", "isle", "jetty", "knoll",
    "loch", "mesa", "narrows", "outcrop", "prairie",
    "quay", "reef", "shoal", "terrace", "upland",
    "vista", "weir", "atoll", "bluff", "cairn",
    "dune", "eddy", "ford", "gully", "haven",
    "islet", "jasper", "karst", "levee", "moor",
    "notch", "oxbow", "pass", "quicksand", "rapids",
    "strand", "trench", "undertow", "verge", "wash",
    "arroyo", "bayou", "cove", "divide", "escarpment",
    "floe", "geyser", "headland", "iceberg", "jungle",
    "key", "lowland", "moraine", "nook", "overhang",
    "plateau", "ravine", "spit", "tarn", "uplift",
    "vent", "waterfall", "ancient", "bright", "calm",
    "deep", "early", "faint", "gentle", "hidden",
    "inner", "jagged", "keen", "low", "misty",
    "narrow", "open", "pale", "quiet", "rough",
    "steep", "tall", "upper", "vast", "wide",
    "amber", "blue", "crimson", "dark", "emerald",
    "frost", "golden", "hazel", "ivory", "jade",
    "khaki", "lilac", "maroon", "navy", "olive",
    "pearl", "quartz", "russet", "silver", "teal",
    "umber", "violet", "white", "azure", "bronze",
    "coral", "dusty", "ecru", "fawn", "gray",
    "hoary", "indigo", "jet", "lemon", "mauve",
    "noir", "ochre", "plum", "rose", "sable",
    "tan", "ultramarine", "verdant", "wheat", "alder",
    "birch", "chestnut", "dogwood", "elm", "fir",
    "ginkgo", "hawthorn", "ironwood", "joshua", "katsura",
    "larch", "magnolia", "nutmeg", "oak", "pine",
    "quince", "redwood", "spruce", "tamarack", "umbrella",
    "vine", "walnut", "yew", "aspen", "beech",
    "cypress", "driftwood", "eucalyptus", "fern", "gorse",
    "holly", "ivy", "jacaranda", "kapok", "laurel",
    "maple", "nettle", "olive_tree", "poplar", "reed",
    "sequoia", "teak", "underbrush", "veld", "wisteria",
    "acacia", "bamboo", "clover", "daffodil", "edelweiss",
    "foxglove", "geranium", "heather", "iris", "jasmine",
    "kudzu", "lavender", "marigold", "narcissus", "orchid",
    "peony", "ranunculus", "saffron", "thistle", "umbel",
    "verbena", "wallflower", "yucca", "zinnia", "alpine",
    "boreal", "coastal", "desert", "estuarine", "fluvial",
    "glacial", "highland", "insular", "jagged_peak", "karstic",
    "littoral", "montane", "nival", "oceanic", "pelagic",
    "riparian", "subalpine", "tidal", "undersea", "volcanic",
    "wetland", "arid", "brackish", "cold", "damp",
    "elevated", "frozen",
]

# Values for the "category" attribute; one is picked uniformly per document.
CATEGORIES = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta"]


def make_mixture(
    rng: np.random.Generator, dim: int, n_clusters: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw the Gaussian-mixture parameters: cluster centers and scales.

    Consumes values from *rng*, so the result is fully determined by the
    generator's state on entry.

    Returns:
        ``(centers, scales)`` as float32 arrays. ``centers`` has shape
        ``(n_clusters, dim)`` with standard-normal entries; ``scales`` has
        shape ``(n_clusters,)`` with entries uniform in ``[0.05, 0.25)``.
    """
    centers = rng.normal(0.0, 1.0, size=(n_clusters, dim)).astype(np.float32)
    scales = rng.uniform(0.05, 0.25, size=n_clusters).astype(np.float32)
    return centers, scales


def generate_vectors(
    n: int,
    dim: int,
    seed: int,
    n_clusters: int = 64,
    noise_seed: int | None = None,
) -> np.ndarray:
    """Return *n* unit-length float32 vectors sampled from a seeded mixture.

    Args:
        n: Number of vectors.
        dim: Vector dimensionality.
        seed: Seed that determines the mixture (cluster centers and scales).
        n_clusters: Number of mixture components.
        noise_seed: Optional seed for the per-vector randomness (cluster
            assignment and noise). When ``None`` the same generator that
            produced the mixture keeps being used, which is the historical
            behaviour and keeps existing datasets reproducible. Passing a
            value lets callers sample *different* points from the *same*
            mixture.

    Returns:
        Array of shape ``(n, dim)``, dtype float32, rows L2-normalised.
    """
    rng = np.random.default_rng(seed)
    centers, scales = make_mixture(rng, dim, n_clusters)

    # A separate stream only when requested; otherwise continue the mixture
    # stream so output for a given ``seed`` is unchanged.
    sample_rng = rng if noise_seed is None else np.random.default_rng(noise_seed)
    assignments = sample_rng.integers(0, n_clusters, size=n)
    noise = sample_rng.normal(0.0, 1.0, size=(n, dim)).astype(np.float32)
    vecs = centers[assignments] + noise * scales[assignments][:, None]

    # L2-normalise so cosine and dot product are well-behaved.
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for degenerate rows
    return (vecs / norms).astype(np.float32)


def zipf_weights(n_words: int, exponent: float = 1.1) -> np.ndarray:
    """Return normalised Zipf probabilities for ranks ``1..n_words``.

    Entry ``i`` is proportional to ``1 / (i + 1) ** exponent``; the array
    sums to 1.
    """
    ranks = np.arange(1, n_words + 1, dtype=np.float64)
    w = 1.0 / np.power(ranks, exponent)
    return w / w.sum()


def generate_text(rng: np.random.Generator, weights: np.ndarray, n_words: int) -> str:
    """Sample *n_words* words from ``VOCAB`` (with replacement), space-joined.

    *weights* must hold one probability per ``VOCAB`` entry.
    """
    idx = rng.choice(len(VOCAB), size=n_words, p=weights)
    return " ".join(VOCAB[i] for i in idx)


def generate_documents(
    n_docs: int, dim: int, seed: int = 42
) -> Iterator[dict]:
    """Yield *n_docs* synthetic documents, deterministically for a given seed.

    Vectors come from ``generate_vectors(n_docs, dim, seed)`` and are computed
    up front; text and attributes use an independent generator seeded with
    ``seed + 1``. Each document is a dict with keys ``id``, ``vector``,
    ``fields`` (``title``, ``body``) and ``attributes`` (``category``,
    ``year``, ``price``, ``in_stock``).
    """
    rng = np.random.default_rng(seed + 1)
    weights = zipf_weights(len(VOCAB))
    vectors = generate_vectors(n_docs, dim, seed)
    for i in range(n_docs):
        # The order of the rng calls below defines the dataset; do not reorder.
        title_len = int(rng.integers(3, 8))
        body_len = int(rng.integers(30, 120))
        yield {
            "id": f"doc-{i:08d}",
            "vector": [round(float(x), 6) for x in vectors[i]],
            "fields": {
                "title": generate_text(rng, weights, title_len),
                "body": generate_text(rng, weights, body_len),
            },
            "attributes": {
                "category": CATEGORIES[int(rng.integers(0, len(CATEGORIES)))],
                "year": int(rng.integers(2000, 2025)),
                "price": round(float(rng.uniform(1.0, 500.0)), 2),
                "in_stock": bool(rng.integers(0, 2)),
            },
        }


def generate_query_vectors(n: int, dim: int, seed: int = 42) -> np.ndarray:
    """Query vectors drawn from the same mixture (different noise seed) so
    they have realistic proximity to the corpus.

    The mixture is rebuilt from *seed* — the same centers and scales the
    corpus of ``generate_documents(..., seed)`` uses — while cluster
    assignments and noise come from a separate stream seeded with
    ``seed + 7919``. Previously the offset seed was also used to build the
    mixture, which placed queries around unrelated cluster centers.
    """
    return generate_vectors(n, dim, seed, noise_seed=seed + 7919)


def generate_query_terms(n: int, seed: int = 42, words_per_query: int = 3) -> List[str]:
    """Return *n* text queries of *words_per_query* Zipf-sampled words each.

    Uses its own generator seeded with ``seed + 104729``.
    """
    rng = np.random.default_rng(seed + 104729)
    weights = zipf_weights(len(VOCAB))
    return [generate_text(rng, weights, words_per_query) for _ in range(n)]


def main() -> None:
    """Command-line entry point: write the dataset as JSON Lines to ``--out``."""
    ap = argparse.ArgumentParser(description="Generate a benchmark dataset.")
    ap.add_argument("--docs", type=int, default=50_000)
    ap.add_argument("--dim", type=int, default=256)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--out", default="data/bench.jsonl")
    args = ap.parse_args()

    # dirname() is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
    n_written = 0
    with open(args.out, "w", encoding="utf-8") as f:
        for doc in generate_documents(args.docs, args.dim, args.seed):
            f.write(json.dumps(doc) + "\n")
            n_written += 1
            if n_written % 10_000 == 0:
                print(f" wrote {n_written}/{args.docs}")
    print(f"Wrote {n_written} documents ({args.dim}-d vectors) to {args.out}")


if __name__ == "__main__":
    main()