"""Structural integrity tests for the 30 historical stress-event dossiers. These tests are the benchmark's first line of defense against silent decay: they assert that the full set of 30 dossiers exists, that every dossier is well-formed YAML, that the corpus is structurally consistent (all dossiers share a common top-level shape), and that no dossier has degraded into a stub. They are deliberately *schema-agnostic* at the field-name level — the canonical field-by-field validation happens inside ``incumbent_benchmark.schema`` when the harness loads a dossier — so these tests stay green across additive schema evolution while still catching the failure modes that matter for an open-source corpus: a deleted file, a truncated file, a hand-edited dossier that no longer matches its siblings. Run with: pytest benchmark/tests/test_dossiers.py -v """ from __future__ import annotations import pathlib import re from typing import Any, Dict, Iterator, List import pytest import yaml DOSSIER_DIR = pathlib.Path(__file__).resolve().parents[1] / "dossiers" # The funded corpus: 30 events across contested certifications/transfers of # power, emergency-power invocations, fiscal/formation deadlock, court # capture, secession crises, and executive self-coups. Adding new dossiers # is encouraged (extra files do not fail this test); *removing* any of the # original 30 is a regression and fails loudly. 
EXPECTED_DOSSIERS = {
    # --- Contested certifications & transfers of power (7) ---
    "us-2020-certification.yaml",
    "us-1876-hayes-tilden.yaml",
    "us-2000-bush-v-gore.yaml",
    "kenya-2007-election.yaml",
    "gambia-2016-transition.yaml",
    "bolivia-2019-election.yaml",
    "australia-1975-dismissal.yaml",
    # --- Emergency powers (6) ---
    "weimar-1930-article48.yaml",
    "india-1975-emergency.yaml",
    "france-1961-article16.yaml",
    "philippines-1972-martial-law.yaml",
    "hungary-2020-enabling-act.yaml",
    "south-korea-2024-martial-law.yaml",
    # --- Fiscal deadlock & government formation (3) ---
    "us-2018-shutdown.yaml",
    "us-2011-debt-ceiling.yaml",
    "belgium-2010-formation.yaml",
    # --- Court capture & judicial crises (5) ---
    "us-1937-court-packing.yaml",
    "hungary-2011-court-capture.yaml",
    "poland-2015-tribunal.yaml",
    "venezuela-2017-tsj.yaml",
    "israel-2023-judicial-overhaul.yaml",
    # --- Secession & dissolution (4) ---
    "us-1860-secession.yaml",
    "canada-1995-quebec.yaml",
    "spain-2017-catalonia.yaml",
    "czechoslovakia-1992-dissolution.yaml",
    # --- Executive self-coups & inter-branch warfare (5) ---
    "peru-1992-autogolpe.yaml",
    "russia-1993-crisis.yaml",
    "honduras-2009-zelaya.yaml",
    "sri-lanka-2018-crisis.yaml",
    "uk-2019-prorogation.yaml",
}

# Anti-stub floor: a "deeply researched dossier" cannot plausibly fit in
# fewer characters than this. Real dossiers in this corpus run 5–20x larger;
# the floor only exists to catch truncation or accidental gutting.
MIN_DOSSIER_CHARS = 2500

# Lower-case alphanumeric segments joined by single hyphens, ending in .yaml.
KEBAB_CASE = re.compile(r"^[a-z0-9]+(-[a-z0-9]+)*\.yaml$")

# A four-digit year in the range 1800-2099, as it appears in dossier filenames.
_YEAR_IN_NAME = re.compile(r"(1[89]\d{2}|20\d{2})")


def dossier_paths() -> List[pathlib.Path]:
    """Return every ``*.yaml`` file directly inside ``DOSSIER_DIR``, sorted."""
    return sorted(DOSSIER_DIR.glob("*.yaml"))


@pytest.fixture(scope="module")
def raw_texts() -> Dict[str, str]:
    """Map each dossier filename to its UTF-8 decoded text (read once per module)."""
    return {p.name: p.read_text(encoding="utf-8") for p in dossier_paths()}


@pytest.fixture(scope="module")
def parsed(raw_texts: Dict[str, str]) -> Dict[str, Any]:
    """Map each dossier filename to the object ``yaml.safe_load`` produces.

    Every file is attempted before failing, so a single run reports *all*
    dossiers that are not valid YAML rather than only the first one.
    """
    documents: Dict[str, Any] = {}
    failures: List[str] = []
    for name, text in raw_texts.items():
        try:
            documents[name] = yaml.safe_load(text)
        except yaml.YAMLError as exc:  # pragma: no cover - failure path
            failures.append(f"{name} is not valid YAML: {exc}")
    if failures:  # pragma: no cover - failure path
        pytest.fail("\n".join(failures))
    return documents


# ---------------------------------------------------------------------------
# Corpus completeness
# ---------------------------------------------------------------------------


def test_dossier_directory_exists() -> None:
    """The dossier directory itself must be present."""
    assert DOSSIER_DIR.is_dir(), f"missing dossier directory: {DOSSIER_DIR}"


def test_all_thirty_dossiers_present() -> None:
    """Every expected dossier file exists; extra files are allowed."""
    found = {p.name for p in dossier_paths()}
    missing = sorted(EXPECTED_DOSSIERS - found)
    assert not missing, (
        "The benchmark corpus is missing funded dossiers. "
        f"Missing: {missing}. Removing a dossier is a breaking change to the "
        "benchmark and requires a versioned deprecation, not a deletion."
    )
    # Independent floor: still trips if the expected set itself is shrunk.
    assert len(found) >= 30, f"only {len(found)} dossier files found; need >= 30"


def test_filenames_are_kebab_case() -> None:
    """Every dossier filename matches ``KEBAB_CASE``."""
    bad = [p.name for p in dossier_paths() if not KEBAB_CASE.match(p.name)]
    assert not bad, f"dossier filenames must be kebab-case .yaml: {bad}"


def test_index_file_exists() -> None:
    """The corpus ships an ``INDEX.md`` next to the dossiers."""
    assert (DOSSIER_DIR / "INDEX.md").is_file(), "dossiers/INDEX.md is missing"


# ---------------------------------------------------------------------------
# Per-dossier well-formedness
# ---------------------------------------------------------------------------


def test_every_dossier_parses_to_nonempty_mapping(parsed: Dict[str, Any]) -> None:
    """Each dossier's top level is a mapping with at least four sections."""
    for name, doc in parsed.items():
        assert isinstance(doc, dict), f"{name}: top level must be a mapping"
        assert doc, f"{name}: dossier is empty"
        assert len(doc) >= 4, (
            f"{name}: only {len(doc)} top-level sections; a real dossier "
            "carries metadata, actors, incumbent constraints, timeline and "
            "outcome material"
        )


def test_no_dossier_is_a_stub(raw_texts: Dict[str, str]) -> None:
    """No dossier's raw text is shorter than ``MIN_DOSSIER_CHARS``."""
    thin = {
        name: len(text)
        for name, text in raw_texts.items()
        if len(text) < MIN_DOSSIER_CHARS
    }
    assert not thin, (
        f"dossiers below the anti-stub floor of {MIN_DOSSIER_CHARS} chars: "
        f"{thin}. These appear truncated or gutted."
    )


def test_no_yaml_merge_or_anchors_across_files(raw_texts: Dict[str, str]) -> None:
    """Each dossier must be self-contained: no include directives.

    YAML has no cross-file include, but people bolt them on with custom
    tags. Dossiers are required to stand alone so they remain independently
    citable and independently replayable.

    Despite the test's name, the only thing checked is that the literal
    substring ``!include`` does not occur anywhere in the raw text; in-file
    anchors and merge keys are not inspected.
    """
    for name, text in raw_texts.items():
        assert "!include" not in text, f"{name}: cross-file includes forbidden"


# ---------------------------------------------------------------------------
# Cross-corpus structural consistency
# ---------------------------------------------------------------------------


def test_dossiers_share_a_common_top_level_shape(parsed: Dict[str, Any]) -> None:
    """All 30 dossiers were authored against one schema; the intersection of
    their top-level keys must therefore be substantial.

    A dossier that has drifted away from the shared shape will shrink the
    intersection and fail here before it fails (more cryptically) inside the
    harness.
    """
    # Guard first: an empty or list-rooted dossier has no keys to intersect,
    # and should fail with its filename rather than an AttributeError.
    not_mappings = sorted(
        name for name, doc in parsed.items() if not isinstance(doc, dict)
    )
    assert not not_mappings, f"top level must be a mapping in: {not_mappings}"

    key_sets = {name: set(doc) for name, doc in parsed.items()}
    common = set.intersection(*key_sets.values()) if key_sets else set()
    # key=str keeps the failure message buildable even when YAML produced
    # non-string keys (e.g. ints or booleans) that cannot be ordered with str.
    assert len(common) >= 3, (
        "dossiers no longer share a common top-level shape; "
        f"shared keys: {sorted(common, key=str)}. "
        f"Per-file keys: { {n: sorted(k, key=str) for n, k in key_sets.items()} }"
    )
    # Every dossier must contain the full common core, by construction —
    # this re-statement exists so a future refactor that special-cases one
    # dossier gets flagged with the offending file named.
    for name, keys in key_sets.items():
        assert common <= keys, f"{name} dropped shared keys {common - keys}"


def _lists_of_mappings(node: Any) -> Iterator[List[Dict[str, Any]]]:
    """Yield every non-empty list consisting solely of dicts, at any depth.

    Dict values and list items are walked recursively; a qualifying list is
    yielded before its own items are descended into. Scalars yield nothing.
    """
    if isinstance(node, dict):
        for value in node.values():
            yield from _lists_of_mappings(value)
    elif isinstance(node, list):
        if node and all(isinstance(item, dict) for item in node):
            yield node  # type: ignore[misc]
        for item in node:
            yield from _lists_of_mappings(item)


def test_every_dossier_contains_structured_actor_or_move_tables(
    parsed: Dict[str, Any]
) -> None:
    """A simulation config is more than prose: it must contain at least one
    structured table (a list of mappings) — actors with incentives,
    permitted moves, or a timeline — with two or more entries.

    A constitutional stress event with a single actor is not a coordination
    problem.
    """
    for name, doc in parsed.items():
        tables = [t for t in _lists_of_mappings(doc) if len(t) >= 2]
        assert tables, (
            f"{name}: contains no list-of-mappings with >= 2 entries; "
            "actors/moves/timeline must be structured data, not prose blobs"
        )


def _country_slug(filename: str) -> str:
    """Return the country part of a dossier filename.

    The country is everything before the first hyphen-delimited year, so
    multi-word countries stay intact (``south-korea-2024-...`` gives
    ``south-korea``). When no such year is found, or nothing precedes it,
    the first hyphen-separated segment is returned instead.
    """
    match = _YEAR_IN_NAME.search(filename)
    if match and match.start() > 0 and filename[match.start() - 1] == "-":
        prefix = filename[: match.start() - 1]
        if prefix:
            return prefix
    return filename.split("-", 1)[0]


def test_corpus_covers_multiple_countries_and_centuries(
    raw_texts: Dict[str, str]
) -> None:
    """Coarse diversity guard derived from filenames: the corpus must never
    collapse into a single-country, single-era benchmark.

    Requires at least 10 distinct country slugs, at least one year before
    1950, and at least one year from 2015 onward.
    """
    names = list(raw_texts)

    countries = {_country_slug(n) for n in names}
    assert len(countries) >= 10, f"only {len(countries)} countries: {countries}"

    years = set()
    for n in names:
        m = _YEAR_IN_NAME.search(n)
        if m:
            years.add(int(m.group(1)))
    assert years, "no parseable years in dossier filenames"
    assert min(years) < 1950 and max(years) >= 2015, (
        f"era spread collapsed: years present = {sorted(years)}"
    )