"""Tests for the aggregate scorecard module.

Only the pure aggregation logic is covered here — extraction, normalisation,
composites, win/tie tallies, and rendering.  The replay harness is never
touched, so the suite runs anywhere the package installs.
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from incumbent_benchmark.aggregate import (
    DEFAULT_WEIGHTS,
    DIMENSIONS,
    EventScores,
    aggregate,
    collect_results,
    composite,
    event_from_mapping,
    extract_scores,
    normalise_event,
    render_json,
    render_markdown,
)


def _event(event_id: str, kernel: dict, incumbent: dict, category: str = "test") -> EventScores:
    """Build an ``EventScores`` fixture with fixed country/year metadata.

    The title is derived from ``event_id`` by swapping hyphens for spaces and
    title-casing the result.
    """
    derived_title = event_id.replace("-", " ").title()
    return EventScores(
        event_id=event_id,
        title=derived_title,
        category=category,
        country="Testland",
        year="1999",
        kernel=kernel,
        incumbent=incumbent,
    )


def _scores(worst_off, commons, trust, latency) -> dict:
    """Return a canonical four-dimension score mapping with float values."""
    keys = ("worst_off", "commons_integrity", "trust_preservation", "latency")
    raw_values = (worst_off, commons, trust, latency)
    return {key: float(raw) for key, raw in zip(keys, raw_values)}


# ---------------------------------------------------------------------------
# Extraction
# ---------------------------------------------------------------------------


def test_extract_scores_canonical_keys():
    """A flat mapping keyed by the canonical names extracts unchanged."""
    flat_side = {
        "worst_off": 70,
        "commons_integrity": 60,
        "trust_preservation": 55,
        "latency": 80,
    }
    extracted = extract_scores(flat_side)
    assert extracted == _scores(70, 60, 55, 80)


def test_extract_scores_aliases_and_container():
    """Alias keys nested under ``scores`` resolve to the canonical names."""
    aliased = {
        "worst-off": 40,
        "commons": 50,
        "trust": 60,
        "latency_to_resolution": 70,
    }
    extracted = extract_scores({"scores": aliased})
    assert extracted == _scores(40, 50, 60, 70)


def test_extract_scores_nested_score_objects():
    """Per-dimension objects carrying ``score`` or ``value`` are unwrapped."""
    mixed_side = {
        "worst_off": {"score": 30, "notes": "x"},
        "commons_integrity": {"value": 35},
        "trust_preservation": 40,
        "latency": 45,
    }
    extracted = extract_scores(mixed_side)
    assert extracted == _scores(30, 35, 40, 45)


def test_extract_scores_missing_dimension_raises():
    """Omitting ``latency`` raises a ``ValueError`` that names it."""
    incomplete = {"worst_off": 1, "commons_integrity": 1, "trust_preservation": 1}
    with pytest.raises(ValueError, match="latency"):
        extract_scores(incomplete)


# ---------------------------------------------------------------------------
# Normalisation and composite
# ---------------------------------------------------------------------------


def test_normalise_unit_scale_rescaled_to_100():
    """Inputs on a 0-1 scale are expected back on a 0-100 scale."""
    incumbent_raw = _scores(0.5, 0.6, 0.7, 0.8)
    kernel_raw = _scores(0.9, 0.4, 0.3, 0.2)
    incumbent_norm, kernel_norm = normalise_event(incumbent_raw, kernel_raw)
    assert incumbent_norm["worst_off"] == pytest.approx(50.0)
    assert kernel_norm["worst_off"] == pytest.approx(90.0)


def test_normalise_hundred_scale_untouched_and_clamped():
    """0-100 inputs keep in-range values; out-of-range ones are clamped."""
    incumbent_raw = _scores(50, 105, 70, 80)
    kernel_raw = _scores(90, 40, 30, -3)
    incumbent_norm, kernel_norm = normalise_event(incumbent_raw, kernel_raw)
    # 105 is pulled down to the ceiling, -3 is lifted to the floor.
    assert incumbent_norm["commons_integrity"] == 100.0
    assert kernel_norm["latency"] == 0.0
    # An in-range value passes through as-is.
    assert incumbent_norm["worst_off"] == 50.0


def test_composite_is_weighted_mean():
    """With only ``worst_off`` non-zero the composite is 100 times its weight."""
    only_worst_off = _scores(100, 0, 0, 0)
    result = composite(only_worst_off, DEFAULT_WEIGHTS)
    expected = 100 * DEFAULT_WEIGHTS["worst_off"]
    assert result == pytest.approx(expected)


def test_composite_uniform_scores_equal_score():
    """Identical scores on every dimension yield that same composite."""
    uniform = _scores(60, 60, 60, 60)
    result = composite(uniform, DEFAULT_WEIGHTS)
    assert result == pytest.approx(60.0)


# ---------------------------------------------------------------------------
# Event mapping
# ---------------------------------------------------------------------------


def test_event_from_mapping_top_level_sides():
    """Metadata under ``event`` plus top-level sides parse into an event."""
    metadata = {"id": "x-1", "title": "X One", "category": "shutdown", "year": 2001}
    mapping = {
        "event": metadata,
        "kernel": _scores(80, 70, 60, 50),
        "incumbent": _scores(40, 30, 20, 10),
    }
    parsed = event_from_mapping(mapping)
    assert parsed.event_id == "x-1"
    assert parsed.category == "shutdown"
    assert parsed.kernel["worst_off"] == 80.0
    assert parsed.incumbent["latency"] == 10.0


def test_event_from_mapping_sides_under_scores_block():
    """Sides under ``scores`` are accepted, with ``baseline`` as incumbent."""
    sides = {
        "kernel": _scores(0.8, 0.7, 0.6, 0.5),
        "baseline": _scores(0.4, 0.3, 0.2, 0.1),
    }
    parsed = event_from_mapping({"id": "y-2", "scores": sides})
    assert parsed.event_id == "y-2"
    # 0-1 scale detected and rescaled jointly
    assert parsed.kernel["worst_off"] == pytest.approx(80.0)
    assert parsed.incumbent["worst_off"] == pytest.approx(40.0)


def test_event_from_mapping_missing_side_raises():
    """A mapping without a kernel side raises ``ValueError`` naming it."""
    kernel_less = {"id": "z", "incumbent": _scores(1, 1, 1, 1)}
    with pytest.raises(ValueError, match="kernel"):
        event_from_mapping(kernel_less)


# ---------------------------------------------------------------------------
# Aggregation
# ---------------------------------------------------------------------------


def test_aggregate_means_wins_and_floor():
    """Three events check win/tie tallies, floors, categories and means."""
    events = [
        _event(
            "a",
            kernel=_scores(80, 80, 80, 80),
            incumbent=_scores(40, 40, 40, 40),
            category="c1",
        ),
        _event(
            "b",
            kernel=_scores(50, 50, 50, 50),
            incumbent=_scores(50.5, 50.5, 50.5, 50.5),
            category="c1",
        ),
        _event(
            "c",
            kernel=_scores(20, 60, 60, 60),
            incumbent=_scores(70, 70, 70, 70),
            category="c2",
        ),
    ]
    summary = aggregate(events, weights=DEFAULT_WEIGHTS, tie_epsilon=1.0)

    assert summary.n_events == 3

    # event a: kernel win; event b: tie (delta 0.5); event c: incumbent win
    for tally_key in ("kernel_wins", "incumbent_wins", "ties"):
        assert summary.composite_stats[tally_key] == 1

    # floor: kernel floor is event c (20), incumbent floor is event a (40)
    kernel_floor = summary.floor["kernel"]
    assert kernel_floor["event_id"] == "c"
    assert kernel_floor["score"] == pytest.approx(20.0)
    incumbent_floor = summary.floor["incumbent"]
    assert incumbent_floor["event_id"] == "a"
    assert incumbent_floor["score"] == pytest.approx(40.0)

    # category breakdown
    assert summary.category_stats["c1"]["n"] == 2
    assert summary.category_stats["c2"]["n"] == 1

    # dimension means
    expected_kernel_mean = (80 + 50 + 20) / 3
    worst_off_stats = summary.dimension_stats["worst_off"]
    assert worst_off_stats["kernel_mean"] == pytest.approx(expected_kernel_mean)


def test_aggregate_rejects_empty():
    """Aggregating an empty event list raises ``ValueError``."""
    no_events = []
    with pytest.raises(ValueError):
        aggregate(no_events)


def test_duplicate_event_ids_rejected(tmp_path: Path):
    """Two result files sharing an id make collection raise ``ValueError``."""
    payload = {
        "id": "dup",
        "kernel": _scores(1, 1, 1, 1),
        "incumbent": _scores(1, 1, 1, 1),
    }
    serialised = json.dumps(payload)
    for filename in ("one.json", "two.json"):
        (tmp_path / filename).write_text(serialised, encoding="utf-8")

    with pytest.raises(ValueError, match="duplicate"):
        collect_results(tmp_path)


def test_collect_results_skips_aggregate_json(tmp_path: Path):
    """An ``aggregate.json`` alongside result files is not read as an event."""
    payload = {
        "id": "solo",
        "kernel": _scores(60, 60, 60, 60),
        "incumbent": _scores(40, 40, 40, 40),
    }
    result_file = tmp_path / "solo.json"
    result_file.write_text(json.dumps(payload), encoding="utf-8")
    previous_output = tmp_path / "aggregate.json"
    previous_output.write_text("{}", encoding="utf-8")

    collected = collect_results(tmp_path)
    collected_ids = [event.event_id for event in collected]
    assert collected_ids == ["solo"]


# ---------------------------------------------------------------------------
# Rendering
# ---------------------------------------------------------------------------


def test_render_markdown_contains_key_sections():
    """The Markdown rendering contains every expected heading and label."""
    events = [
        _event("a", kernel=_scores(80, 80, 80, 80), incumbent=_scores(40, 40, 40, 40)),
        _event("b", kernel=_scores(30, 30, 30, 30), incumbent=_scores(60, 60, 60, 60)),
    ]
    markdown = render_markdown(aggregate(events, weights=DEFAULT_WEIGHTS))

    expected_fragments = (
        "# Incumbent Benchmark — Aggregate Scorecard",
        "## Headline",
        "## By rubric dimension",
        "## By event category",
        "## Per-event results",
        "Worst-off participant",
        "METHODOLOGY.md",
    )
    for fragment in expected_fragments:
        assert fragment in markdown


def test_render_json_round_trips():
    """The JSON rendering parses back and exposes the expected fields."""
    single_event = _event(
        "a",
        kernel=_scores(80, 80, 80, 80),
        incumbent=_scores(40, 40, 40, 40),
    )
    summary = aggregate([single_event], weights=DEFAULT_WEIGHTS)
    decoded = json.loads(render_json(summary))

    assert decoded["n_events"] == 1
    first_event = decoded["events"][0]
    assert first_event["id"] == "a"
    assert first_event["composite_delta"] == pytest.approx(40.0)
    rendered_dimensions = set(decoded["dimensions"].keys())
    assert rendered_dimensions == set(DIMENSIONS)