"""Golden-file confidence regression harness. The harness runs the full default-deriver pipeline over the committed sample corpus and snapshots a map of "subject|predicate|canonical-json(value)" -> confidence (rounded) for every active claim. The snapshot is compared against a committed baseline at ``tests/regression/golden/confidence.golden.json``. Workflow: * Generate / regenerate the baseline: ``python scripts/update_goldens.py`` (or run pytest with ``MNEMA_UPDATE_GOLDEN=1``), then commit the file. * If no baseline exists yet, the comparison test SKIPS with instructions rather than asserting against fabricated numbers. * Structural tests (determinism, unit-interval bounds, key uniqueness) always run regardless of the baseline. Any confidence drift beyond TOLERANCE, or any change in the derived claim set, fails with a readable diff so calibration changes are made deliberately, not accidentally. """ from __future__ import annotations import json import math import os from pathlib import Path import pytest from mnema.core.keys import KeyPair from mnema.core.log import OperationLog from mnema.derive.engine import DerivationEngine from mnema.derive.loaders import load_calendar, load_notes, load_photos REPO_ROOT = Path(__file__).resolve().parents[2] SAMPLES_DIR = REPO_ROOT / "data" / "samples" GOLDEN_PATH = Path(__file__).resolve().parent / "golden" / "confidence.golden.json" TOLERANCE = 1e-6 UPDATE_ENV = "MNEMA_UPDATE_GOLDEN" def _load_corpus(): evidence = [] evidence += load_calendar(SAMPLES_DIR / "calendar.sample.jsonl") evidence += load_notes(SAMPLES_DIR / "notes.sample.jsonl") evidence += load_photos(SAMPLES_DIR / "photos.sample.jsonl") assert evidence, "sample corpus must not be empty" return evidence def _claim_key(claim) -> str: return "|".join( [ str(claim.subject), str(claim.predicate), json.dumps(claim.value, sort_keys=True, default=str), ] ) def compute_snapshot(workdir: Path) -> dict: """Run the default pipeline over the sample corpus; return the claim-key -> confidence map. Keys are content-based (subject / predicate / value), so the snapshot is independent of signing keys and operation ids.""" workdir.mkdir(parents=True, exist_ok=True) log = OperationLog(workdir / "ops.jsonl") engine = DerivationEngine(log=log, keypair=KeyPair.generate()) engine.ingest(_load_corpus()) engine.run() engine.run() # settle any higher-order derivations snapshot: dict = {} for claim in engine.claims(): key = _claim_key(claim) assert key not in snapshot, f"duplicate active claim key: {key}" snapshot[key] = round(float(claim.confidence), 8) return snapshot def test_snapshot_is_deterministic(tmp_path): a = compute_snapshot(tmp_path / "a") b = compute_snapshot(tmp_path / "b") assert a == b, "confidence pipeline must be deterministic over fixed evidence" def test_all_confidences_in_unit_interval(tmp_path): snapshot = compute_snapshot(tmp_path / "bounds") assert snapshot for key, conf in snapshot.items(): assert math.isfinite(conf), f"non-finite confidence for {key}" assert 0.0 < conf <= 1.0, f"confidence {conf} outside (0, 1] for {key}" def test_confidence_snapshot_matches_golden(tmp_path): snapshot = compute_snapshot(tmp_path / "golden") assert snapshot, "pipeline produced no claims to snapshot" if os.environ.get(UPDATE_ENV): GOLDEN_PATH.parent.mkdir(parents=True, exist_ok=True) GOLDEN_PATH.write_text( json.dumps(snapshot, sort_keys=True, indent=2) + "\n", encoding="utf-8", ) pytest.skip( f"golden baseline regenerated at {GOLDEN_PATH}; " "review the diff and commit it" ) if not GOLDEN_PATH.exists(): pytest.skip( "no committed confidence baseline yet; generate one with " "`python scripts/update_goldens.py` (writes " f"{GOLDEN_PATH.relative_to(REPO_ROOT)}), then commit it" ) golden = json.loads(GOLDEN_PATH.read_text(encoding="utf-8")) removed = sorted(set(golden) - set(snapshot)) added = sorted(set(snapshot) - set(golden)) assert not removed and not added, ( "derived claim set drifted from the golden baseline.\n" f" claims missing vs baseline ({len(removed)}): {removed}\n" f" new claims vs baseline ({len(added)}): {added}\n" "If this change is intentional, regenerate with " "`python scripts/update_goldens.py` and commit the diff." ) drift = { key: {"golden": golden[key], "current": snapshot[key]} for key in golden if abs(float(golden[key]) - float(snapshot[key])) > TOLERANCE } assert not drift, ( f"confidence drifted beyond tolerance {TOLERANCE} for " f"{len(drift)} claim(s):\n" + json.dumps(drift, sort_keys=True, indent=2) + "\nIf this calibration change is intentional, regenerate with " "`python scripts/update_goldens.py` and commit the diff." )