"""Corpus-wide validation: structure, coverage, counts, and behavioral regression. These tests are the contract for the scenario corpus itself: * every scenario file parses and every scenario carries the required fields; * IDs are globally unique; * every scenario documents its precedent and its empathy analysis; * the corpus meets the funded size floor (>= 200 scenarios); * each base attack family has substantial coverage (>= 20 scenarios); * the composite family exists, has >= 12 scenarios, each combining two or more distinct base families, and the composite set as a whole touches every base family; * every scenario, run through the engine against the current constitutional parameters, produces exactly its expected verdict. """ from __future__ import annotations from collections import Counter from pathlib import Path import pytest import yaml from fabletest.engine import run_scenario from fabletest.harness import load_corpus from fabletest.params import load_parameters from fabletest.taxonomy import FAMILIES ROOT = Path(__file__).resolve().parent.parent SCENARIO_DIR = ROOT / "scenarios" PARAMS_PATH = ROOT / "constitution" / "parameters.yaml" BASE_FAMILIES = {f for f in FAMILIES if f != "composite"} VALID_FAMILIES = set(FAMILIES) | {"composite"} REQUIRED_FIELDS = ( "id", "title", "family", "severity", "precedent", "actors", "moves", "expected", "empathy", ) MIN_PRECEDENT_CHARS = 80 MIN_RATIONALE_CHARS = 40 CORPUS_SIZE_FLOOR = 200 BASE_FAMILY_FLOOR = 20 COMPOSITE_FLOOR = 12 def _raw_scenarios() -> list[dict]: """Load every scenario as raw YAML, independent of the model layer, so structural defects are reported precisely rather than swallowed by deserialization errors. 
""" out: list[dict] = [] for path in sorted(SCENARIO_DIR.glob("*.yaml")): with path.open("r", encoding="utf-8") as fh: doc = yaml.safe_load(fh) assert isinstance(doc, dict), f"{path.name}: top level must be a mapping" scenarios = doc.get("scenarios") assert isinstance(scenarios, list) and scenarios, ( f"{path.name}: must contain a non-empty 'scenarios' list" ) for sc in scenarios: assert isinstance(sc, dict), f"{path.name}: scenario entries must be mappings" sc.setdefault("_source", path.name) out.append(sc) return out RAW = _raw_scenarios() RAW_BY_ID = {sc.get("id"): sc for sc in RAW} # --------------------------------------------------------------------------- # Structural validation # --------------------------------------------------------------------------- def test_corpus_size_floor(): assert len(RAW) >= CORPUS_SIZE_FLOOR, ( f"corpus has {len(RAW)} scenarios; milestone floor is {CORPUS_SIZE_FLOOR}" ) def test_unique_ids(): ids = [sc.get("id") for sc in RAW] dupes = [i for i, n in Counter(ids).items() if n > 1] assert not dupes, f"duplicate scenario ids: {dupes}" assert all(ids), "every scenario must carry a non-empty id" @pytest.mark.parametrize("sc", RAW, ids=lambda sc: f"{sc.get('_source')}::{sc.get('id')}") def test_required_fields(sc): missing = [f for f in REQUIRED_FIELDS if f not in sc or sc[f] in (None, "", [])] assert not missing, f"{sc.get('id')}: missing required fields {missing}" @pytest.mark.parametrize("sc", RAW, ids=lambda sc: str(sc.get("id"))) def test_family_valid(sc): assert sc["family"] in VALID_FAMILIES, ( f"{sc['id']}: unknown family {sc['family']!r}; valid: {sorted(VALID_FAMILIES)}" ) @pytest.mark.parametrize("sc", RAW, ids=lambda sc: str(sc.get("id"))) def test_precedent_documented(sc): precedent = str(sc["precedent"]).strip() assert len(precedent) >= MIN_PRECEDENT_CHARS, ( f"{sc['id']}: precedent must document the historical or game-theoretic " f"basis ({MIN_PRECEDENT_CHARS}+ chars); got {len(precedent)}" ) 
@pytest.mark.parametrize("sc", RAW, ids=lambda sc: str(sc.get("id")))
def test_moves_reference_declared_actors(sc):
    """Every move must be made by a declared actor and must name an action."""
    actor_ids = {a["id"] for a in sc["actors"]}
    assert actor_ids, f"{sc['id']}: scenario declares no actors"
    for move in sc["moves"]:
        assert move.get("actor") in actor_ids, (
            f"{sc['id']}: move step {move.get('step')} references undeclared "
            f"actor {move.get('actor')!r}"
        )
        assert move.get("action"), f"{sc['id']}: every move must name an action"


@pytest.mark.parametrize("sc", RAW, ids=lambda sc: str(sc.get("id")))
def test_expected_block_well_formed(sc):
    """The expected block must carry a permitted verdict and a non-empty defense list."""
    expected = sc["expected"]
    assert expected.get("verdict") in {"blocked", "contained"}, (
        f"{sc['id']}: regression corpus scenarios must expect 'blocked' or "
        f"'contained' under the current text; got {expected.get('verdict')!r}"
    )
    defenses = expected.get("defenses")
    assert isinstance(defenses, list) and defenses, (
        f"{sc['id']}: expected block must enumerate the textual defenses relied on"
    )


# ---------------------------------------------------------------------------
# Empathy metric validation — every scenario, no exceptions
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("sc", RAW, ids=lambda sc: str(sc.get("id")))
def test_empathy_block(sc):
    """Validate the scenario's empathy analysis.

    Checks, in order:

    * ``worst_off`` names one of the scenario's declared actors;
    * the stripped ``rationale`` has at least ``MIN_RATIONALE_CHARS`` characters;
    * ``floor_if_blocked`` and ``floor_if_exploited`` are real numbers in
      [0, 1] (booleans are rejected);
    * when the expected verdict is ``"blocked"``, the blocked floor is
      strictly greater than the exploited floor.
    """
    emp = sc["empathy"]
    actor_ids = {a["id"] for a in sc["actors"]}
    assert emp.get("worst_off") in actor_ids, (
        f"{sc['id']}: empathy.worst_off must identify a declared actor"
    )
    rationale = str(emp.get("rationale", "")).strip()
    assert len(rationale) >= MIN_RATIONALE_CHARS, (
        f"{sc['id']}: empathy.rationale must explain why this cohort is "
        f"worst-off and how the harm compounds"
    )
    fb = emp.get("floor_if_blocked")
    fe = emp.get("floor_if_exploited")
    for label, val in (("floor_if_blocked", fb), ("floor_if_exploited", fe)):
        # bool is a subclass of int, so a YAML `true`/`false` would otherwise
        # slip through as 1/0; a floor must be an actual number.
        is_number = isinstance(val, (int, float)) and not isinstance(val, bool)
        assert is_number and 0.0 <= val <= 1.0, (
            f"{sc['id']}: empathy.{label} must be a number in [0, 1]; got {val!r}"
        )
    if sc["expected"].get("verdict") == "blocked":
        assert fb > fe, (
            f"{sc['id']}: when the attack is blocked, the worst-off cohort's "
            f"floor must strictly exceed the exploited floor — otherwise the "
            f"defense is not actually protecting anyone"
        )


# ---------------------------------------------------------------------------
# Family coverage
# ---------------------------------------------------------------------------


def _family_counts() -> Counter:
    """Return the number of raw scenarios per ``family`` value."""
    return Counter(sc["family"] for sc in RAW)


def test_every_base_family_covered():
    """Each base family must have at least ``BASE_FAMILY_FLOOR`` scenarios."""
    counts = _family_counts()
    # Iterate in sorted order so the first reported shortfall is the same on
    # every run; plain set iteration order over strings varies with hash seed.
    for fam in sorted(BASE_FAMILIES):
        assert counts.get(fam, 0) >= BASE_FAMILY_FLOOR, (
            f"family {fam!r} has {counts.get(fam, 0)} scenarios; "
            f"floor is {BASE_FAMILY_FLOOR}"
        )


def test_composite_family_size():
    """The composite family must have at least ``COMPOSITE_FLOOR`` scenarios."""
    counts = _family_counts()
    assert counts.get("composite", 0) >= COMPOSITE_FLOOR, (
        f"composite family has {counts.get('composite', 0)} scenarios; "
        f"floor is {COMPOSITE_FLOOR}"
    )


def test_composite_combines_valid():
    """Each composite scenario must list two or more distinct, known base families."""
    composites = [sc for sc in RAW if sc["family"] == "composite"]
    for sc in composites:
        combines = sc.get("combines")
        assert isinstance(combines, list), (
            f"{sc['id']}: composite scenarios must declare 'combines'"
        )
        assert len(set(combines)) >= 2, (
            f"{sc['id']}: composite scenarios must combine at least two "
            f"distinct base families; got {combines}"
        )
        unknown = set(combines) - BASE_FAMILIES
        assert not unknown, f"{sc['id']}: unknown base families in combines: {unknown}"


def test_composite_set_covers_all_base_families():
    """Taken together, the composite scenarios must touch every base family."""
    touched: set[str] = set()
    for sc in RAW:
        if sc["family"] != "composite":
            continue
        combines = sc.get("combines")
        # Only list values count. A null `combines` would make set.update
        # raise TypeError and a bare string would add its characters; the
        # malformed value itself is reported by test_composite_combines_valid.
        if isinstance(combines, list):
            touched.update(combines)
    missing = BASE_FAMILIES - touched
    assert not missing, (
        f"composite scenarios collectively must exercise every base family; "
        f"missing: {sorted(missing)}"
    )


def test_non_composite_scenarios_do_not_declare_combines():
    """Only composite scenarios may carry a non-empty ``combines`` value."""
    offenders = [
        sc["id"]
        for sc in RAW
        if sc["family"] != "composite" and sc.get("combines")
    ]
    assert not offenders, (
        f"only composite scenarios may declare 'combines': {offenders}"
    )
# ---------------------------------------------------------------------------
# Behavioral regression: every scenario produces its expected verdict
# under the current constitutional parameters
# ---------------------------------------------------------------------------

# Both are loaded once at import time so the parametrize decorators can use them.
CORPUS = load_corpus(SCENARIO_DIR)
PARAMS = load_parameters(PARAMS_PATH)


def test_loader_agrees_with_raw_count():
    """The model loader must return as many scenarios as the raw YAML holds."""
    loaded_count = len(CORPUS)
    raw_count = len(RAW)
    assert loaded_count == raw_count, (
        f"model loader returned {loaded_count} scenarios but raw YAML contains "
        f"{raw_count} — the loader is silently dropping or duplicating entries"
    )


@pytest.mark.parametrize("scenario", CORPUS, ids=lambda s: s.id)
def test_scenario_produces_expected_verdict(scenario):
    """Running the scenario under ``PARAMS`` must yield its expected verdict."""
    outcome = run_scenario(scenario, PARAMS)
    wanted = scenario.expected["verdict"]
    produced = outcome.verdict
    assert produced == wanted, (
        f"{scenario.id}: expected {wanted!r}, engine produced "
        f"{produced!r} — either the constitutional text regressed or "
        f"the scenario encoding is wrong; see docs/runbook.md before editing "
        f"the scenario"
    )


@pytest.mark.parametrize("scenario", CORPUS, ids=lambda s: s.id)
def test_scenario_empathy_floor_holds(scenario):
    """A blocked or contained run must not undercut the declared blocked floor.

    The engine's ``empathy_floor`` is compared against the scenario's
    ``floor_if_blocked`` with a 1e-9 tolerance. Runs with any other verdict
    are not checked here.
    """
    outcome = run_scenario(scenario, PARAMS)
    declared_floor = scenario.empathy["floor_if_blocked"]
    if outcome.verdict not in {"blocked", "contained"}:
        return
    measured_floor = outcome.empathy_floor
    # The small tolerance absorbs floating-point noise in the engine's result.
    assert measured_floor >= declared_floor - 1e-9, (
        f"{scenario.id}: worst-off cohort floor {measured_floor:.3f} "
        f"fell below the declared floor {declared_floor:.3f} even though "
        f"the attack was {outcome.verdict} — the defense is leaking harm "
        f"onto the worst-off participant"
    )