"""Golden-file confidence regression harness.

The harness runs the full default-deriver pipeline over the committed
sample corpus and snapshots a map of

    "subject|predicate|canonical-json(value)"  ->  confidence (rounded)

for every active claim. The snapshot is compared against a committed
baseline at ``tests/regression/golden/confidence.golden.json``.

Workflow:
  * Generate / regenerate the baseline:  ``python scripts/update_goldens.py``
    (or run pytest with ``MNEMA_UPDATE_GOLDEN=1``), then commit the file.
  * If no baseline exists yet, the comparison test SKIPS with instructions
    rather than asserting against fabricated numbers.
  * Structural tests (determinism, unit-interval bounds, key uniqueness)
    always run regardless of the baseline.

Any confidence drift beyond TOLERANCE, or any change in the derived claim
set, fails with a readable diff so calibration changes are made
deliberately, not accidentally.
"""

from __future__ import annotations

import json
import math
import os
from pathlib import Path

import pytest

from mnema.core.keys import KeyPair
from mnema.core.log import OperationLog
from mnema.derive.engine import DerivationEngine
from mnema.derive.loaders import load_calendar, load_notes, load_photos

# Repository root, taken as the ``parents[2]`` ancestor of this file's
# resolved path (this module is expected to live two directories deep).
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory holding the sample evidence files read by ``_load_corpus``.
SAMPLES_DIR = REPO_ROOT / "data" / "samples"
# Committed baseline file, kept in a ``golden`` directory next to this module.
GOLDEN_PATH = Path(__file__).resolve().parent / "golden" / "confidence.golden.json"
# Largest absolute confidence difference accepted by the golden comparison.
TOLERANCE = 1e-6
# Environment variable that switches the golden comparison test into
# baseline-regeneration mode.
UPDATE_ENV = "MNEMA_UPDATE_GOLDEN"


def _load_corpus():
    """Load the sample evidence corpus as a single list.

    Calendar, notes and photos samples are read (in that order) from
    ``SAMPLES_DIR`` with their respective loaders and concatenated.
    Asserts that the combined corpus is non-empty.
    """
    sources = (
        (load_calendar, "calendar.sample.jsonl"),
        (load_notes, "notes.sample.jsonl"),
        (load_photos, "photos.sample.jsonl"),
    )
    records = []
    for loader, filename in sources:
        records.extend(loader(SAMPLES_DIR / filename))
    assert records, "sample corpus must not be empty"
    return records


def _claim_key(claim) -> str:
    return "|".join(
        [
            str(claim.subject),
            str(claim.predicate),
            json.dumps(claim.value, sort_keys=True, default=str),
        ]
    )


def compute_snapshot(workdir: Path) -> dict:
    """Return the claim-key -> confidence map for the sample corpus.

    Creates ``workdir`` if needed, backs the engine with an operation log
    at ``workdir / "ops.jsonl"`` and a freshly generated key pair, ingests
    the sample corpus and runs the engine twice. Each claim reported by
    the engine is keyed by its content (subject / predicate / value, see
    ``_claim_key``), so the keys do not depend on the signing key pair;
    confidences are rounded to 8 decimal places. Asserts that no two
    claims share a key.
    """
    workdir.mkdir(parents=True, exist_ok=True)
    engine = DerivationEngine(
        log=OperationLog(workdir / "ops.jsonl"),
        keypair=KeyPair.generate(),
    )
    engine.ingest(_load_corpus())
    # Two passes: the second settles any higher-order derivations.
    for _ in range(2):
        engine.run()

    confidences: dict = {}
    for claim in engine.claims():
        claim_key = _claim_key(claim)
        assert claim_key not in confidences, f"duplicate active claim key: {claim_key}"
        confidences[claim_key] = round(float(claim.confidence), 8)
    return confidences


def test_snapshot_is_deterministic(tmp_path):
    """Two independent pipeline runs over the same corpus must agree."""
    runs = [compute_snapshot(tmp_path / label) for label in ("a", "b")]
    assert runs[0] == runs[1], "confidence pipeline must be deterministic over fixed evidence"


def test_all_confidences_in_unit_interval(tmp_path):
    """Every snapshotted confidence must be finite and lie in (0, 1]."""
    confidences = compute_snapshot(tmp_path / "bounds")
    assert confidences
    for claim_key in confidences:
        value = confidences[claim_key]
        assert math.isfinite(value), f"non-finite confidence for {claim_key}"
        assert value > 0.0 and value <= 1.0, (
            f"confidence {value} outside (0, 1] for {claim_key}"
        )


def test_confidence_snapshot_matches_golden(tmp_path):
    """Compare the current confidence snapshot with the committed baseline.

    Behaviour:
      * If the ``UPDATE_ENV`` environment variable holds an affirmative
        value, the snapshot is written to ``GOLDEN_PATH`` and the test is
        skipped so the regenerated file can be reviewed and committed.
      * If no baseline file exists, the test is skipped with instructions.
      * Otherwise the set of claim keys must match the baseline exactly and
        every confidence must be within ``TOLERANCE`` (absolute) of its
        baseline value; failures report a readable diff.
    """
    snapshot = compute_snapshot(tmp_path / "golden")
    assert snapshot, "pipeline produced no claims to snapshot"

    # Only an affirmative value enables regeneration. A plain truthiness
    # check would treat e.g. "0" or "false" (non-empty strings) as "on" and
    # silently overwrite the committed baseline.
    update_flag = os.environ.get(UPDATE_ENV, "").strip().lower()
    if update_flag not in {"", "0", "false", "no", "off"}:
        GOLDEN_PATH.parent.mkdir(parents=True, exist_ok=True)
        GOLDEN_PATH.write_text(
            json.dumps(snapshot, sort_keys=True, indent=2) + "\n",
            encoding="utf-8",
        )
        pytest.skip(
            f"golden baseline regenerated at {GOLDEN_PATH}; "
            "review the diff and commit it"
        )

    if not GOLDEN_PATH.exists():
        pytest.skip(
            "no committed confidence baseline yet; generate one with "
            "`python scripts/update_goldens.py` (writes "
            f"{GOLDEN_PATH.relative_to(REPO_ROOT)}), then commit it"
        )

    golden = json.loads(GOLDEN_PATH.read_text(encoding="utf-8"))

    # The claim set itself is part of the contract: check it before values.
    removed = sorted(set(golden) - set(snapshot))
    added = sorted(set(snapshot) - set(golden))
    assert not removed and not added, (
        "derived claim set drifted from the golden baseline.\n"
        f"  claims missing vs baseline ({len(removed)}): {removed}\n"
        f"  new claims vs baseline ({len(added)}): {added}\n"
        "If this change is intentional, regenerate with "
        "`python scripts/update_goldens.py` and commit the diff."
    )

    # ``math.isclose`` with rel_tol=0 is a pure absolute-tolerance check
    # (|a - b| <= TOLERANCE). Unlike ``abs(a - b) > TOLERANCE`` it is False
    # whenever either side is NaN, so a NaN confidence counts as drift
    # instead of slipping through the comparison.
    drift = {
        key: {"golden": golden[key], "current": snapshot[key]}
        for key in golden
        if not math.isclose(
            float(golden[key]),
            float(snapshot[key]),
            rel_tol=0.0,
            abs_tol=TOLERANCE,
        )
    }
    assert not drift, (
        f"confidence drifted beyond tolerance {TOLERANCE} for "
        f"{len(drift)} claim(s):\n"
        + json.dumps(drift, sort_keys=True, indent=2)
        + "\nIf this calibration change is intentional, regenerate with "
        "`python scripts/update_goldens.py` and commit the diff."
    )