"""Unit tests for the aggregate scorecard module.

These tests exercise the pure aggregation logic — extraction, normalisation,
composites, win/tie tallies, and rendering — without touching the replay
harness, so they run anywhere the package installs.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from incumbent_benchmark.aggregate import (
    DEFAULT_WEIGHTS,
    DIMENSIONS,
    EventScores,
    aggregate,
    collect_results,
    composite,
    event_from_mapping,
    extract_scores,
    normalise_event,
    render_json,
    render_markdown,
)


def _event(event_id: str, kernel: dict, incumbent: dict, category: str = "test") -> EventScores:
    """Build an ``EventScores`` fixture with placeholder metadata.

    The title is derived from ``event_id`` (hyphens become spaces, then the
    result is title-cased); country and year are fixed dummy values. The two
    score mappings are passed through unchanged.
    """
    display_title = event_id.replace("-", " ").title()
    fields = dict(
        event_id=event_id,
        title=display_title,
        category=category,
        country="Testland",
        year="1999",
        kernel=kernel,
        incumbent=incumbent,
    )
    return EventScores(**fields)


def _scores(worst_off, commons, trust, latency) -> dict:
    """Return a mapping of the four rubric dimensions to float scores.

    Arguments are given positionally in the order worst-off, commons
    integrity, trust preservation, latency; each is coerced with ``float``.
    """
    dimension_names = ("worst_off", "commons_integrity", "trust_preservation", "latency")
    raw_values = (worst_off, commons, trust, latency)
    return {name: float(value) for name, value in zip(dimension_names, raw_values)}


# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------


def test_extract_scores_canonical_keys():
    """A flat mapping using the canonical dimension keys is read directly."""
    flat = {"worst_off": 70, "commons_integrity": 60, "trust_preservation": 55, "latency": 80}
    extracted = extract_scores(flat)
    assert extracted == _scores(70, 60, 55, 80)


def test_extract_scores_aliases_and_container():
    """Aliased dimension keys nested under a ``scores`` container are resolved."""
    aliased = {
        "worst-off": 40,
        "commons": 50,
        "trust": 60,
        "latency_to_resolution": 70,
    }
    extracted = extract_scores({"scores": aliased})
    assert extracted == _scores(40, 50, 60, 70)


def test_extract_scores_nested_score_objects():
    """Dimensions wrapped in ``score`` / ``value`` objects mix with bare numbers."""
    side = dict(
        worst_off={"score": 30, "notes": "x"},
        commons_integrity={"value": 35},
        trust_preservation=40,
        latency=45,
    )
    extracted = extract_scores(side)
    assert extracted == _scores(30, 35, 40, 45)


def test_extract_scores_missing_dimension_raises():
    """Leaving out ``latency`` raises ``ValueError`` with a message matching it."""
    present = ("worst_off", "commons_integrity", "trust_preservation")
    incomplete = dict.fromkeys(present, 1)
    with pytest.raises(ValueError, match="latency"):
        extract_scores(incomplete)


# ---------------------------------------------------------------------------
# Normalisation and composite
# ---------------------------------------------------------------------------


def test_normalise_unit_scale_rescaled_to_100():
    """Scores supplied on a 0-1 scale come back on 0-100 (checked on worst-off)."""
    incumbent_raw = _scores(0.5, 0.6, 0.7, 0.8)
    kernel_raw = _scores(0.9, 0.4, 0.3, 0.2)

    scaled_incumbent, scaled_kernel = normalise_event(incumbent_raw, kernel_raw)

    assert scaled_incumbent["worst_off"] == pytest.approx(50.0)
    assert scaled_kernel["worst_off"] == pytest.approx(90.0)


def test_normalise_hundred_scale_untouched_and_clamped():
    """On a 0-100 scale, in-range values pass through and outliers are clamped."""
    incumbent_raw = _scores(50, 105, 70, 80)  # 105 is above the ceiling
    kernel_raw = _scores(90, 40, 30, -3)  # -3 is below the floor

    clamped_incumbent, clamped_kernel = normalise_event(incumbent_raw, kernel_raw)

    assert clamped_incumbent["commons_integrity"] == 100.0
    assert clamped_kernel["latency"] == 0.0
    assert clamped_incumbent["worst_off"] == 50.0


def test_composite_is_weighted_mean():
    """With only worst-off non-zero, the composite is 100 times that weight."""
    only_worst_off = _scores(100, 0, 0, 0)
    expected = 100 * DEFAULT_WEIGHTS["worst_off"]
    assert composite(only_worst_off, DEFAULT_WEIGHTS) == pytest.approx(expected)


def test_composite_uniform_scores_equal_score():
    """When every dimension has the same score, the composite equals that score."""
    level = 60
    uniform = _scores(level, level, level, level)
    result = composite(uniform, DEFAULT_WEIGHTS)
    assert result == pytest.approx(60.0)


# ---------------------------------------------------------------------------
# Event mapping
# ---------------------------------------------------------------------------


def test_event_from_mapping_top_level_sides():
    """Metadata under ``event`` and sides at the top level are both picked up."""
    metadata = {"id": "x-1", "title": "X One", "category": "shutdown", "year": 2001}
    parsed = event_from_mapping(
        {
            "event": metadata,
            "kernel": _scores(80, 70, 60, 50),
            "incumbent": _scores(40, 30, 20, 10),
        }
    )

    # Identity and category come from the nested "event" block.
    assert parsed.event_id == "x-1"
    assert parsed.category == "shutdown"
    # Scores already on the 0-100 scale are carried over.
    assert parsed.kernel["worst_off"] == 80.0
    assert parsed.incumbent["latency"] == 10.0


def test_event_from_mapping_sides_under_scores_block():
    """Sides nested under ``scores`` are found, with ``baseline`` as the incumbent."""
    sides = {
        "kernel": _scores(0.8, 0.7, 0.6, 0.5),
        "baseline": _scores(0.4, 0.3, 0.2, 0.1),
    }
    parsed = event_from_mapping({"id": "y-2", "scores": sides})

    assert parsed.event_id == "y-2"
    # Both sides were given on a 0-1 scale and come back rescaled to 0-100.
    assert parsed.kernel["worst_off"] == pytest.approx(80.0)
    assert parsed.incumbent["worst_off"] == pytest.approx(40.0)


def test_event_from_mapping_missing_side_raises():
    """A mapping without a kernel side raises ``ValueError`` matching ``kernel``."""
    incumbent_only = {"id": "z", "incumbent": _scores(1, 1, 1, 1)}
    with pytest.raises(ValueError, match="kernel"):
        event_from_mapping(incumbent_only)


# ---------------------------------------------------------------------------
# Aggregation
# ---------------------------------------------------------------------------


def test_aggregate_means_wins_and_floor():
    """Three events check win/tie tallies, floors, category counts and a mean."""
    # (event id, kernel scores, incumbent scores, category)
    specs = [
        ("a", (80, 80, 80, 80), (40, 40, 40, 40), "c1"),
        ("b", (50, 50, 50, 50), (50.5, 50.5, 50.5, 50.5), "c1"),
        ("c", (20, 60, 60, 60), (70, 70, 70, 70), "c2"),
    ]
    events = [
        _event(eid, kernel=_scores(*ker), incumbent=_scores(*inc), category=cat)
        for eid, ker, inc, cat in specs
    ]
    report = aggregate(events, weights=DEFAULT_WEIGHTS, tie_epsilon=1.0)

    assert report.n_events == 3

    # Event a is a kernel win, event b a tie (delta 0.5 is inside the
    # epsilon of 1.0), and event c an incumbent win.
    tallies = report.composite_stats
    assert tallies["kernel_wins"] == 1
    assert tallies["incumbent_wins"] == 1
    assert tallies["ties"] == 1

    # Floors: the kernel's is event c (20), the incumbent's is event a (40).
    kernel_floor = report.floor["kernel"]
    assert kernel_floor["event_id"] == "c"
    assert kernel_floor["score"] == pytest.approx(20.0)
    incumbent_floor = report.floor["incumbent"]
    assert incumbent_floor["event_id"] == "a"
    assert incumbent_floor["score"] == pytest.approx(40.0)

    # Per-category event counts.
    by_category = report.category_stats
    assert by_category["c1"]["n"] == 2
    assert by_category["c2"]["n"] == 1

    # Mean kernel score on the worst-off dimension across all three events.
    expected_mean = (80 + 50 + 20) / 3
    assert report.dimension_stats["worst_off"]["kernel_mean"] == pytest.approx(expected_mean)


def test_aggregate_rejects_empty():
    """Aggregating an empty event list raises ``ValueError``."""
    no_events = []
    with pytest.raises(ValueError):
        aggregate(no_events)


def test_duplicate_event_ids_rejected(tmp_path: Path):
    """Two result files with the same event id make ``collect_results`` raise."""
    uniform = _scores(1, 1, 1, 1)
    serialised = json.dumps(
        {
            "id": "dup",
            "kernel": uniform,
            "incumbent": _scores(1, 1, 1, 1),
        }
    )
    # Write the identical payload under two file names so the id collides.
    for filename in ("one.json", "two.json"):
        (tmp_path / filename).write_text(serialised, encoding="utf-8")

    with pytest.raises(ValueError, match="duplicate"):
        collect_results(tmp_path)


def test_collect_results_skips_aggregate_json(tmp_path: Path):
    """An ``aggregate.json`` file in the results directory is not read as an event."""
    solo_event = {
        "id": "solo",
        "kernel": _scores(60, 60, 60, 60),
        "incumbent": _scores(40, 40, 40, 40),
    }
    result_file = tmp_path / "solo.json"
    result_file.write_text(json.dumps(solo_event), encoding="utf-8")
    # An empty JSON object here would not parse as an event if it were read.
    aggregate_file = tmp_path / "aggregate.json"
    aggregate_file.write_text("{}", encoding="utf-8")

    collected_ids = [event.event_id for event in collect_results(tmp_path)]
    assert collected_ids == ["solo"]


# ---------------------------------------------------------------------------
# Rendering
# ---------------------------------------------------------------------------


def test_render_markdown_contains_key_sections():
    """The rendered Markdown includes each expected heading and label."""
    kernel_ahead = _event("a", kernel=_scores(80, 80, 80, 80), incumbent=_scores(40, 40, 40, 40))
    incumbent_ahead = _event("b", kernel=_scores(30, 30, 30, 30), incumbent=_scores(60, 60, 60, 60))
    report = aggregate([kernel_ahead, incumbent_ahead], weights=DEFAULT_WEIGHTS)

    rendered = render_markdown(report)

    expected_fragments = (
        "# Incumbent Benchmark — Aggregate Scorecard",
        "## Headline",
        "## By rubric dimension",
        "## By event category",
        "## Per-event results",
        "Worst-off participant",
        "METHODOLOGY.md",
    )
    for fragment in expected_fragments:
        assert fragment in rendered


def test_render_json_round_trips():
    """The JSON rendering parses back and exposes the expected fields."""
    single = _event("a", kernel=_scores(80, 80, 80, 80), incumbent=_scores(40, 40, 40, 40))
    report = aggregate([single], weights=DEFAULT_WEIGHTS)

    decoded = json.loads(render_json(report))

    assert decoded["n_events"] == 1
    first_event = decoded["events"][0]
    assert first_event["id"] == "a"
    # Kernel at 80 against incumbent at 40 on every dimension: delta of 40.
    assert first_event["composite_delta"] == pytest.approx(40.0)
    assert set(decoded["dimensions"].keys()) == set(DIMENSIONS)