#!/usr/bin/env python3 """Ingest the bundled article corpus into LagoonDB for the semantic-search demo. Usage: python demos/semantic-search/ingest.py LAGOON_EMBEDDINGS=sentence-transformers python demos/semantic-search/ingest.py python demos/semantic-search/ingest.py --namespace my-articles --recreate """ from __future__ import annotations import argparse import json import sys import time from pathlib import Path # Allow running from the repo root or from this directory. sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from demos.common.embeddings import get_provider # noqa: E402 from demos.common.lagoon_client import LagoonClient # noqa: E402 DATA_FILE = Path(__file__).resolve().parent / "data" / "articles.jsonl" DEFAULT_NAMESPACE = "demo-articles" def load_articles() -> list[dict]: articles = [] with DATA_FILE.open("r", encoding="utf-8") as fh: for line in fh: line = line.strip() if line: articles.append(json.loads(line)) return articles def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--namespace", default=DEFAULT_NAMESPACE) parser.add_argument( "--provider", default=None, help="embedding provider: hash | sentence-transformers | openai " "(default: $LAGOON_EMBEDDINGS or 'hash')", ) parser.add_argument( "--recreate", action="store_true", help="drop and recreate the namespace before ingesting", ) args = parser.parse_args() provider = get_provider(args.provider) client = LagoonClient() articles = load_articles() print(f"Loaded {len(articles)} articles from {DATA_FILE.name}") print(f"Embedding provider: {provider.name} ({provider.dims} dims)") if args.recreate: client.delete_namespace(args.namespace) client.create_namespace( args.namespace, schema={ "vector": {"dims": provider.dims, "metric": "cosine"}, "full_text": ["title", "body"], "filterable": ["category", "year"], # Record the embedding space so queries can verify compatibility. "annotations": {"embedding_provider": provider.name}, }, ) # Embed title + body together so titles contribute to the vector. texts = [f"{a['title']}. {a['body']}" for a in articles] t0 = time.time() vectors = provider.embed(texts) print(f"Embedded {len(texts)} documents in {time.time() - t0:.2f}s") documents = [ { "id": article["id"], "vector": vector, "attributes": { "title": article["title"], "body": article["body"], "category": article["category"], "year": article["year"], }, } for article, vector in zip(articles, vectors) ] t0 = time.time() written = client.upsert(args.namespace, documents, batch_size=100) print(f"Upserted {written} documents in {time.time() - t0:.2f}s (WAL durable)") ns = client.wait_for_indexing(args.namespace) stats = ns.get("stats", {}) print( "Background indexing complete: " f"{stats.get('document_count', written)} docs, " f"{stats.get('segment_count', '?')} segments" ) print(f"\nReady. Try:\n python demos/semantic-search/search.py " f'"why do coral reefs matter" --namespace {args.namespace}') if __name__ == "__main__": main()