"""Unit tests for tournament orchestration: determinism, structure, hygiene.""" from __future__ import annotations import json from fable_selfplay.tournament import run_tournament from .conftest import episode_dicts, make_config _VOLATILE_KEYS = {"elapsed_seconds", "timestamp", "started_at", "finished_at"} def _stable(result_dict: dict) -> dict: return {k: v for k, v in result_dict.items() if k not in _VOLATILE_KEYS} def test_episode_count_matches_config(kernel_v02): config = make_config(kernel_v02, {"honest": 4}, seed=3, episodes=4) result = run_tournament(config) assert len(episode_dicts(result)) == 4 def test_same_seed_is_deterministic(kernel_v02): config = make_config(kernel_v02, {"honest": 4}, seed=21, episodes=2) first = _stable(run_tournament(config).to_dict()) second = _stable(run_tournament(config).to_dict()) assert json.dumps(first, sort_keys=True, default=str) == \ json.dumps(second, sort_keys=True, default=str) def test_different_seeds_diverge(kernel_v02): a = run_tournament(make_config(kernel_v02, {"honest": 4}, seed=1, episodes=2)) b = run_tournament(make_config(kernel_v02, {"honest": 4}, seed=2, episodes=2)) dump_a = json.dumps(_stable(a.to_dict()), sort_keys=True, default=str) dump_b = json.dumps(_stable(b.to_dict()), sort_keys=True, default=str) assert dump_a != dump_b def test_honest_only_run_has_no_exploits(small_honest_result): exploits = list(getattr(small_honest_result, "exploits", []) or []) assert exploits == [], ( "an honest-only population must never trigger exploit detectors; " f"got {len(exploits)} record(s)" ) def test_result_serializes_to_json(small_adversarial_result): result_dict = small_adversarial_result.to_dict() # Must round-trip through json without custom encoders. text = json.dumps(result_dict, sort_keys=True, default=str) assert json.loads(text) def test_exploit_records_are_well_formed(small_adversarial_result): exploits = list(getattr(small_adversarial_result, "exploits", []) or []) for exploit in exploits: record = exploit.to_dict() if hasattr(exploit, "to_dict") else dict(exploit) assert record.get("exploit_id"), record assert record.get("detector"), record def test_metrics_present_and_include_empathy(small_honest_result): metrics = small_honest_result.to_dict().get("metrics", {}) assert metrics, "tournament result must report aggregate metrics" assert any("empathy" in key for key in metrics), ( f"empathy metric missing from {sorted(metrics)}" )