---
# Catalog header: version tag, display name, and statement of intent.
# The version stays a quoted string so parsers do not read it as a float.
suite_version: '0.1'
name: 'AI Creates a Programming Language Initial Benchmark Catalog'
purpose: >-
  Seed catalog for evaluating an LLM-first programming language against Python,
  Rust, and TypeScript. This catalog defines task intent, expected capabilities,
  and scoring concerns; later milestones should convert entries into executable
  benchmark fixtures.

# Languages being compared. Each entry names the toolchain target and the
# practices a submitted solution in that language is expected to follow.
baselines:
  - language: 'Python'
    target: 'current CPython'
    required_practices:
      - 'Use declared dependencies only.'
      - 'Use type hints where they clarify public boundaries.'
      # Worded to agree with the network rule listed under common_constraints.
      - 'Use deterministic tests and no network access unless the task explicitly allows it.'
  - language: 'Rust'
    target: 'current stable Rust'
    required_practices:
      - 'Use Cargo project conventions.'
      - 'Use idiomatic Result and Option handling.'
      - 'Use declared crates only.'
  - language: 'TypeScript'
    target: 'current TypeScript with Node.js LTS-compatible runtime'
    required_practices:
      - 'Use strict compiler settings.'
      - 'Use declared dependencies only.'
      - 'Use runtime validation when external input requires it.'
  # The language under test; its target is whatever prototype exists at
  # evaluation time.
  - language: 'NewLanguage'
    target: 'project prototype version under test'
    required_practices:
      - 'Use the real compiler/tooling state available at evaluation time.'
      - 'Record unsupported features separately from failed attempts.'
      - 'Use declared packages and capabilities only.'

# Ways a solution may be produced. Task entries refer to these ids in their
# `modes` lists.
evaluation_modes:
  - id: 'one_shot'
    description: 'Generate a solution without build or test feedback.'
  - id: 'compiler_repair'
    description: >-
      Allow up to five repair iterations using compiler/build/lint diagnostics.
  - id: 'agent_loop'
    description: >-
      Allow a tool-using agent to run public tests and edit within a fixed budget.
  - id: 'maintenance'
    description: 'Modify an existing codebase according to a new requirement.'

# Metrics recorded for each evaluated solution. Ids are stable identifiers;
# descriptions state what is measured.
primary_metrics:
  - id: 'hidden_test_pass_rate'
    description: 'Fraction of hidden tests passed.'
  - id: 'first_pass_build_success'
    # Accepts either form of deliverable, matching the first entry of
    # common_constraints (runnable entrypoint or callable API).
    description: >-
      Whether the first generated solution parses, resolves dependencies,
      builds, and exposes the entrypoint or callable API required by the task.
  - id: 'repair_iterations'
    description: >-
      Number of feedback/edit cycles required to reach the best submitted
      solution.
  - id: 'hallucinated_api_count'
    description: >-
      Number of nonexistent or wrong-version symbols, modules, methods,
      packages, or configuration keys.
  - id: 'security_defect_count'
    # The metric is a count; severity is recorded alongside each defect.
    description: 'Number of security defects, with the severity of each defect recorded.'
  - id: 'performance_result'
    description: 'Whether runtime and memory constraints are satisfied.'
  - id: 'maintainability_score'
    description: 'Rubric-based review score for clarity, modularity, and auditability.'

# Rules listed once here rather than repeated inside each task entry.
common_constraints:
  - >-
    Solutions must include a runnable entrypoint or callable API as specified by
    each task.
  - 'Solutions must not rely on undeclared dependencies.'
  - 'Solutions must not access the network unless the task explicitly allows it.'
  - 'Solutions must handle invalid input according to the task specification.'
  - 'Generated tests must not replace or weaken benchmark tests.'

tasks:
  # Micro task: turn newline-delimited log lines into typed events and report
  # recoverable, line-numbered parse errors.
  - id: 'micro_log_event_parser'
    title: 'Typed log event parser'
    tier: 'micro'
    category: 'data_parsing_validation'
    modes:
      - 'one_shot'
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Parse newline-delimited log entries with timestamps, levels, component
      names, and key-value fields into typed events. Return recoverable parse
      errors with line numbers.
    required_capabilities:
      - 'string parsing'
      - 'typed variants or equivalent'
      - 'recoverable error reporting'
      - 'timestamp validation'
    acceptance:
      functional:
        - 'Parses valid log lines into structured records.'
        - 'Reports all invalid lines without discarding valid lines.'
        - 'Preserves field ordering only where specified.'
      edge_cases:
        - 'Empty input.'
        - 'Escaped separators in values.'
        - 'Unknown log levels.'
        - 'Malformed timestamps.'
      performance: 'Handles 100000 short lines within the task time limit.'
    targeted_failure_modes:
      - 'edge_case_failure'
      - 'unhandled_error'
      - 'stringly_typed_variants'
      - 'performance_timeout'
    # Per-language allowances, keyed by lowercase baseline language name.
    baseline_notes:
      python: 'May use standard datetime and dataclasses.'
      rust: >-
        May use standard library only unless date parsing dependency is
        explicitly permitted by the fixture.
      typescript: 'Must validate runtime input shapes explicitly.'

  # Micro task: deterministic topological ordering, or a cycle explanation when
  # the graph is cyclic.
  # NOTE(review): the summary asks for a "minimal" cycle witness while the
  # acceptance list only requires a real cycle — confirm which one the hidden
  # tests enforce.
  - id: 'micro_topological_sort'
    title: 'Topological sort with cycle explanation'
    tier: 'micro'
    category: 'algorithms_edge_cases'
    modes:
      - 'one_shot'
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Given named nodes and directed dependency edges, return a stable
      topological ordering or a minimal human-readable cycle witness.
    required_capabilities:
      - 'graph traversal'
      - 'deterministic ordering'
      - 'typed error result'
    acceptance:
      functional:
        - 'Returns a valid order for acyclic graphs.'
        - 'Reports a real cycle for cyclic graphs.'
        - 'Uses deterministic tie-breaking by input order.'
      edge_cases:
        - 'Disconnected graph.'
        - 'Duplicate edges.'
        - 'Self-cycle.'
        - 'Unknown node in edge.'
      performance: 'Handles 50000 nodes and 200000 edges within the time limit.'
    targeted_failure_modes:
      - 'logic_error'
      - 'performance_timeout'
      - 'nondeterminism'
      - 'edge_case_failure'

  # Micro task: merge intervals under explicit open/closed endpoint rules and
  # report invalid intervals.
  - id: 'micro_interval_merge'
    title: 'Interval merge with boundary semantics'
    tier: 'micro'
    category: 'algorithms_edge_cases'
    modes:
      - 'one_shot'
      - 'compiler_repair'
    prompt_summary: >-
      Merge intervals with explicit open/closed endpoint semantics and report
      invalid intervals. Adjacent intervals merge only when the specification
      says their endpoints overlap.
    required_capabilities:
      - 'ordering'
      - 'boundary condition handling'
      - 'input validation'
    acceptance:
      functional:
        - 'Merges overlapping intervals correctly.'
        - 'Distinguishes open and closed endpoints.'
        - 'Rejects invalid reversed intervals.'
      edge_cases:
        - 'Zero-length closed interval.'
        - 'Zero-length open interval.'
        - 'Adjacent but non-overlapping intervals.'
        - 'Unsorted input.'
      performance: 'O(n log n) expected due to sorting.'
    targeted_failure_modes:
      - 'edge_case_failure'
      - 'logic_error'
      - 'test_overfit'

  # Micro task: parse and evaluate arithmetic expressions, reporting typed
  # errors for undefined variables, division by zero, and syntax problems.
  - id: 'micro_expr_eval'
    title: 'Small expression parser and evaluator'
    tier: 'micro'
    category: 'data_parsing_validation'
    modes:
      - 'one_shot'
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Implement a parser and evaluator for arithmetic expressions with
      variables, precedence, parentheses, unary minus, and typed evaluation
      errors.
    required_capabilities:
      - 'recursive descent or equivalent parsing'
      - 'operator precedence'
      - 'typed errors'
      - 'deterministic evaluation'
    acceptance:
      functional:
        - 'Evaluates valid expressions with the specified precedence.'
        - 'Reports undefined variables.'
        - 'Reports division by zero.'
        - 'Reports syntax errors with positions.'
      edge_cases:
        - 'Nested parentheses.'
        - 'Unary minus before parentheses.'
        - 'Whitespace variants.'
        - 'Large integer inputs as specified by fixture.'
      performance: 'Linear or near-linear parse time for valid expressions.'
    targeted_failure_modes:
      - 'syntax_error'
      - 'logic_error'
      - 'unhandled_error'
      - 'edge_case_failure'

  # Micro task: deterministic, path-based diff of two JSON-like values that
  # keeps "missing" and "null" distinct.
  - id: 'micro_canonical_json_diff'
    title: 'Canonical JSON-like diff'
    tier: 'micro'
    category: 'data_parsing_validation'
    modes:
      - 'one_shot'
      - 'compiler_repair'
    prompt_summary: >-
      Compare two JSON-like values and emit a deterministic list of path-based
      differences with stable ordering and clear handling of missing versus null.
    required_capabilities:
      - 'recursive data modeling'
      - 'deterministic traversal'
      - 'serialization'
    acceptance:
      functional:
        - 'Reports added, removed, and changed values.'
        - 'Distinguishes missing fields from null values.'
        - 'Sorts object keys deterministically.'
      edge_cases:
        - 'Arrays of different lengths.'
        - 'Nested objects.'
        - 'Primitive type changes.'
      performance: >-
        Handles deeply nested but valid inputs without stack overflow where
        language permits.
    targeted_failure_modes:
      - 'null_or_optional_misuse'
      - 'nondeterminism'
      - 'logic_error'

  # Micro task: fixed-capacity LRU cache with an inspection hook that must not
  # disturb recency.
  - id: 'micro_lru_cache'
    title: 'Least-recently-used cache'
    tier: 'micro'
    category: 'algorithms_edge_cases'
    modes:
      - 'one_shot'
      - 'compiler_repair'
    prompt_summary: >-
      Implement a fixed-capacity LRU cache with get, put, update, delete, and
      deterministic state inspection for tests.
    required_capabilities:
      - 'mutable state discipline'
      - 'hash map plus ordering or equivalent'
      - 'capacity validation'
    acceptance:
      functional:
        - 'Evicts the least recently used item.'
        - 'Updates recency on get and put.'
        - 'Handles capacity one.'
        - 'Rejects zero capacity.'
      edge_cases:
        - 'Repeated put for same key.'
        - 'Delete missing key.'
        - 'Inspection does not alter recency.'
      performance: 'Expected O(1) get and put for large operation sequences.'
    targeted_failure_modes:
      - 'logic_error'
      - 'performance_timeout'
      - 'state_mutation_bug'

  # Meso task: command-line task manager backed by a JSON file with atomic
  # writes. Acceptance includes a security section.
  - id: 'meso_cli_task_store'
    title: 'Persistent CLI task manager'
    tier: 'meso'
    category: 'command_line_tools'
    modes:
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Build a command-line task manager that stores tasks in a JSON file,
      supports add/list/done/delete, validates input, and writes atomically.
    required_capabilities:
      - 'CLI argument parsing'
      - 'filesystem effects'
      - 'JSON serialization'
      - 'atomic write or safe replacement'
      - 'recoverable errors'
    acceptance:
      functional:
        - 'Creates a store when missing.'
        - 'Lists tasks deterministically.'
        - 'Marks tasks done without changing IDs.'
        - 'Handles corrupt storage with a clear error.'
      security:
        - 'Does not write outside the requested store path.'
        - 'Does not execute shell commands.'
      edge_cases:
        - 'Empty title.'
        - 'Concurrent-looking repeated writes in tests.'
        - 'Deletion of nonexistent task.'
      performance: 'Handles 10000 tasks within task limits.'
    targeted_failure_modes:
      - 'cross_file_mismatch'
      - 'unhandled_error'
      - 'missing_dependency'
      - 'security_vulnerability'

  # Meso task: check local links and anchors in Markdown files without any
  # network access, reporting source positions for broken links.
  - id: 'meso_markdown_link_checker'
    title: 'Offline Markdown link checker'
    tier: 'meso'
    category: 'command_line_tools'
    modes:
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Scan Markdown files for local links and anchors, report broken links with
      source positions, and avoid network access.
    required_capabilities:
      - 'directory traversal'
      - 'Markdown link parsing subset'
      - 'path normalization'
      - 'structured reporting'
    acceptance:
      functional:
        - 'Finds inline and reference-style local links in the supported subset.'
        - 'Validates relative file targets.'
        - 'Validates heading anchors with documented slug rules.'
        - 'Reports line and column where practical.'
      security:
        - 'Prevents path traversal outside the root.'
        - 'Performs no network requests.'
      edge_cases:
        - 'Escaped brackets.'
        - 'Duplicate headings.'
        - 'Case sensitivity according to fixture platform rules.'
      performance: 'Handles 1000 small files within the time limit.'
    targeted_failure_modes:
      - 'path_traversal'
      - 'logic_error'
      - 'performance_timeout'
      - 'hidden_runtime_behavior'

  # Meso task: migrate configuration documents between schema versions while
  # keeping extension fields the migrator does not know about.
  - id: 'meso_config_migrator'
    title: 'Versioned configuration migrator'
    tier: 'meso'
    category: 'data_parsing_validation'
    modes:
      - 'compiler_repair'
      - 'agent_loop'
      - 'maintenance'
    prompt_summary: >-
      Migrate configuration documents across schema versions while preserving
      unknown extension fields and reporting invalid versions.
    required_capabilities:
      - 'schema modeling'
      - 'version dispatch'
      - 'data preservation'
      - 'typed migration errors'
    acceptance:
      functional:
        - 'Migrates v1 and v2 documents to v3.'
        - 'Rejects unsupported versions.'
        # Matches the prompt summary and the nested unknown-field edge case:
        # the fields to preserve are the unknown ones.
        - 'Preserves unknown extension fields.'
        - 'Provides reversible migration notes where specified.'
      edge_cases:
        - 'Missing version.'
        - 'String versus numeric version.'
        - 'Unknown fields in nested sections.'
      performance: 'Handles large but valid config files without quadratic traversal.'
    targeted_failure_modes:
      - 'cross_file_mismatch'
      - 'null_or_optional_misuse'
      - 'edge_case_failure'

  # Meso task: redact secrets from logs with multiple detectors, never emitting
  # the original secret in output or diagnostics.
  - id: 'meso_secret_redactor'
    title: 'Streaming secret redaction pipeline'
    tier: 'meso'
    category: 'security_sensitive_programming'
    modes:
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Redact secrets from logs using multiple detectors, preserve line
      structure, report redaction counts, and avoid logging the original secret.
    required_capabilities:
      - 'streaming text processing'
      - 'safe logging'
      - 'pattern matching'
      - 'testable detectors'
    acceptance:
      functional:
        - 'Redacts tokens matching specified secret patterns.'
        - 'Preserves non-secret text.'
        - 'Reports counts by detector.'
        - 'Handles secrets split by common delimiters as specified.'
      security:
        - 'Never emits full original secret in output or diagnostics.'
        - 'Uses deterministic redaction markers.'
      edge_cases:
        - 'Overlapping matches.'
        - 'Very long lines.'
        - 'False-positive allowlist entries.'
      performance: 'Processes large logs in a streaming or memory-bounded manner.'
    targeted_failure_modes:
      - 'security_vulnerability'
      - 'secret_leakage'
      - 'performance_timeout'
      - 'logic_error'

  # Meso task: token-bucket rate limiter driven by an injectable clock so that
  # refill and consume behavior can be tested deterministically.
  - id: 'meso_rate_limiter'
    title: 'Deterministic token-bucket rate limiter'
    tier: 'meso'
    category: 'services_apis'
    modes:
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Implement a token-bucket rate limiter using an injectable clock so tests
      can deterministically check refill and consume behavior.
    required_capabilities:
      - 'time capability abstraction'
      - 'numeric edge cases'
      - 'state management'
      - 'deterministic tests'
    acceptance:
      functional:
        - 'Consumes tokens according to capacity and refill rate.'
        - 'Uses injected time, not global wall-clock time.'
        - 'Handles fractional refill semantics as specified.'
        - 'Rejects invalid capacity or rate.'
      edge_cases:
        - 'Large time jumps.'
        - 'No time elapsed.'
        - 'Burst at exact boundary.'
      performance: 'O(1) per consume operation.'
    targeted_failure_modes:
      - 'hidden_runtime_behavior'
      - 'nondeterminism'
      - 'logic_error'

  # Meso task: bounded worker pool with ordered results, error propagation,
  # and cancellation that does not leak work.
  - id: 'meso_worker_pool'
    title: 'Bounded worker pool with cancellation'
    tier: 'meso'
    category: 'concurrent_async_systems'
    modes:
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Implement a bounded worker pool that processes jobs, returns ordered
      results, propagates errors, and supports cancellation without leaking work.
    required_capabilities:
      - 'structured concurrency'
      - 'bounded queues'
      - 'error propagation'
      - 'cancellation'
    acceptance:
      functional:
        - 'Limits concurrent workers.'
        - 'Returns results in input order.'
        - 'Stops accepting work after cancellation.'
        - 'Cleans up workers on error.'
      edge_cases:
        - 'Zero jobs.'
        - 'Worker error midway.'
        - 'Cancellation before start.'
        - 'Cancellation during processing.'
      performance: >-
        Demonstrates parallel speedup with fake delayed jobs where runtime
        supports it.
    targeted_failure_modes:
      - 'concurrency_error'
      - 'unhandled_error'
      - 'nondeterminism'
      - 'resource_leak'

  # Meso task: resolve user-supplied relative paths under a root directory
  # while rejecting traversal, absolute paths, and symlink escapes.
  - id: 'meso_safe_file_join'
    title: 'Safe path resolver'
    tier: 'meso'
    category: 'security_sensitive_programming'
    modes:
      - 'one_shot'
      - 'compiler_repair'
      - 'agent_loop'
    prompt_summary: >-
      Resolve user-provided relative paths under a configured root directory,
      preventing traversal, symlink escape according to fixture rules, and
      platform-specific separator confusion.
    required_capabilities:
      - 'path normalization'
      - 'security boundary modeling'
      - 'error reporting'
    acceptance:
      functional:
        - 'Accepts valid paths under root.'
        - 'Rejects parent traversal attempts.'
        - 'Rejects absolute paths.'
        - 'Handles separators consistently.'
      security:
        - 'Does not follow symlinks outside root when fixture enables symlink tests.'
        - 'Does not string-concatenate paths unsafely.'
      edge_cases:
        - 'Encoded-looking path segments.'
        - 'Repeated separators.'
        - 'Current-directory segments.'
      performance: 'O(path length) expected.'
    targeted_failure_modes:
      - 'security_vulnerability'
      - 'path_traversal'
      - 'platform_assumption'

  # Macro task: double-entry ledger library with balanced postings, immutable
  # audit history, and deterministic reports.
  - id: 'macro_double_entry_ledger'
    title: 'Double-entry ledger core'
    tier: 'macro'
    category: 'services_apis'
    modes:
      - 'agent_loop'
      - 'maintenance'
    prompt_summary: >-
      Build a ledger library with accounts, journal entries, balanced postings,
      validation, immutable audit history, and deterministic reports.
    required_capabilities:
      - 'domain modeling'
      - 'invariants'
      - 'persistence abstraction'
      - 'report generation'
      - 'error handling'
    acceptance:
      functional:
        - 'Rejects unbalanced journal entries.'
        - 'Prevents mutation of posted historical entries except through reversal.'
        - 'Produces deterministic account balances.'
        - 'Imports and exports fixture data.'
      security:
        - 'Does not expose unsafe deserialization.'
      edge_cases:
        - 'Zero-value postings according to specified rule.'
        - 'Currency mismatch.'
        - 'Reversal of nonexistent entry.'
        - 'Duplicate IDs.'
      performance: 'Handles 100000 postings within defined time and memory limits.'
    targeted_failure_modes:
      - 'logic_error'
      - 'cross_file_mismatch'
      - 'invariant_violation'
      - 'poor_maintainability'

  # Macro task: resolve package versions under semantic-version constraints and
  # explain unsatisfiable requirements deterministically.
  - id: 'macro_dependency_resolver'
    title: 'Package dependency resolver'
    tier: 'macro'
    category: 'algorithms_edge_cases'
    modes:
      - 'agent_loop'
    prompt_summary: >-
      Resolve package versions with semantic constraints, conflicts, and
      deterministic explanation of unsatisfiable requirements.
    required_capabilities:
      - 'constraint solving'
      - 'semantic version comparison'
      - 'backtracking or equivalent'
      - 'conflict explanation'
    acceptance:
      functional:
        - 'Finds a valid resolution when one exists.'
        - 'Reports a real minimal or near-minimal conflict when impossible.'
        - 'Handles pre-release rules as specified.'
        - 'Uses deterministic tie-breaking.'
      edge_cases:
        - 'Diamond dependency.'
        - 'Conflicting transitive constraints.'
        - 'Multiple valid highest versions.'
      performance: 'Handles medium fixture graphs within time limits.'
    targeted_failure_modes:
      - 'logic_error'
      - 'performance_timeout'
      - 'nondeterminism'
      - 'edge_case_failure'

  # Macro task: small JSON API service with typed request validation, explicit
  # error responses, and an idempotent create operation.
  - id: 'macro_http_json_service'
    title: 'Small HTTP JSON service'
    tier: 'macro'
    category: 'services_apis'
    modes:
      - 'agent_loop'
      - 'maintenance'
    prompt_summary: >-
      Build a small JSON API service with typed request validation, explicit
      error responses, idempotent create operation, and deterministic tests
      against an in-process server or equivalent.
    required_capabilities:
      - 'routing'
      - 'JSON schema validation'
      - 'error mapping'
      - 'idempotency'
      - 'state management'
    acceptance:
      functional:
        - 'Accepts valid create/get/list requests.'
        - 'Rejects invalid JSON with structured errors.'
        - 'Implements idempotency key semantics.'
        - 'Provides deterministic test setup and teardown.'
      security:
        - 'Does not leak internal errors.'
        - 'Rejects oversized payloads according to fixture limits.'
      edge_cases:
        - 'Duplicate idempotency key.'
        - 'Missing required field.'
        - 'Unknown field handling as specified.'
      performance: 'Meets basic latency constraints under concurrent fixture requests.'
    targeted_failure_modes:
      - 'wrong_api_version'
      - 'security_vulnerability'
      - 'concurrency_error'
      - 'cross_file_mismatch'

  # Macro task: rules engine that loads and validates declarative rules,
  # evaluates facts, detects conflicts, and reports traces.
  - id: 'macro_rules_engine'
    title: 'Declarative rules engine'
    tier: 'macro'
    category: 'data_parsing_validation'
    modes:
      - 'agent_loop'
    prompt_summary: >-
      Implement a small rules engine that loads declarative rules, validates
      them, evaluates facts, detects conflicting rules, and reports traces.
    required_capabilities:
      - 'DSL parsing or structured config parsing'
      - 'validation'
      - 'deterministic evaluation'
      - 'trace reporting'
    acceptance:
      functional:
        - 'Evaluates matching rules in priority order.'
        - 'Rejects invalid rule references.'
        - 'Detects specified conflict patterns.'
        - 'Produces human-readable and machine-readable traces.'
      edge_cases:
        - 'No matching rule.'
        - 'Circular rule reference.'
        - 'Equal priority conflict.'
      performance: 'Handles large rule sets with expected indexing or filtering.'
    targeted_failure_modes:
      - 'logic_error'
      - 'nondeterminism'
      - 'performance_timeout'
      - 'poor_maintainability'

  # Macro task: merge offline edits to a note-like document under a specified
  # CRDT-style operation model.
  - id: 'macro_crdt_note_merge'
    title: 'Offline note merge component'
    tier: 'macro'
    category: 'concurrent_async_systems'
    modes:
      - 'agent_loop'
    prompt_summary: >-
      Merge offline edits to a note-like document using a specified CRDT-style
      operation model, preserving convergence, idempotence, and causal metadata.
    required_capabilities:
      - 'distributed data modeling'
      - 'merge invariants'
      - 'property testing'
      - 'deterministic serialization'
    acceptance:
      functional:
        - 'Converges for operations applied in different orders.'
        - 'Handles duplicate operations idempotently.'
        - 'Rejects malformed operations.'
        - 'Serializes state deterministically.'
      edge_cases:
        - 'Concurrent insert at same position.'
        - 'Delete before receiving insert.'
        - 'Duplicate actor IDs according to fixture rule.'
      performance: 'Handles fixture operation histories within limits.'
    targeted_failure_modes:
      - 'logic_error'
      - 'invariant_violation'
      - 'nondeterminism'
      - 'performance_timeout'

  # Maintenance task: add schema versioning and a v1-to-v2 migration to an
  # existing store. Acceptance is grouped into functional, maintainability,
  # and edge_cases.
  - id: 'maint_add_schema_versioning'
    title: 'Add schema versioning to an existing store'
    tier: 'maintenance'
    category: 'maintenance_refactoring'
    modes:
      - 'maintenance'
    prompt_summary: >-
      Given an existing small persisted data store, add schema versioning and
      migration from v1 to v2 without breaking existing behavior.
    required_capabilities:
      - 'multi-file modification'
      - 'backward compatibility'
      - 'migration testing'
      - 'interface preservation'
    acceptance:
      functional:
        - 'Existing tests continue to pass.'
        - 'Old fixture files migrate correctly.'
        - 'New writes include the v2 schema marker.'
        - 'Invalid future versions fail safely.'
      maintainability:
        - 'Patch is focused and does not rewrite unrelated code.'
        - 'Public API remains stable unless specification requires change.'
      edge_cases:
        - 'Missing version marker.'
        - 'Corrupt file.'
        - 'Mixed old and new optional fields.'
    targeted_failure_modes:
      - 'cross_file_mismatch'
      - 'regression'
      - 'test_overfit'
      - 'poor_maintainability'

  # Maintenance task: add role-based authorization to selected operations of an
  # existing API without leaking resource existence or changing other behavior.
  - id: 'maint_add_authorization'
    title: 'Add authorization checks to existing API'
    tier: 'maintenance'
    category: 'maintenance_refactoring'
    modes:
      - 'maintenance'
    prompt_summary: >-
      Given an existing API service, add role-based authorization to selected
      operations without changing unrelated behavior or leaking resource
      existence across unauthorized boundaries.
    required_capabilities:
      - 'security policy modeling'
      - 'minimal patching'
      - 'regression avoidance'
      - 'test updates'
    acceptance:
      functional:
        - 'Allows authorized roles.'
        - 'Rejects unauthorized roles with specified response.'
        - 'Preserves existing successful behavior.'
        - 'Adds focused tests for authorization.'
      security:
        - 'Does not leak secret fields.'
        - 'Does not reveal forbidden resource existence when policy forbids it.'
      edge_cases:
        - 'Missing authentication context.'
        - 'Unknown role.'
        - 'Admin override according to fixture rule.'
    targeted_failure_modes:
      - 'security_vulnerability'
      - 'authorization_bypass'
      - 'regression'
      - 'cross_file_mismatch'

  # Maintenance task: speed up correct-but-slow grouping/reporting code while
  # keeping output ordering, public API, and edge-case behavior intact.
  - id: 'maint_optimize_without_regression'
    title: 'Optimize slow grouping code'
    tier: 'maintenance'
    category: 'maintenance_refactoring'
    modes:
      - 'maintenance'
    prompt_summary: >-
      Given correct but slow grouping/reporting code, improve performance while
      preserving output ordering, public API, and edge-case behavior.
    required_capabilities:
      - 'performance reasoning'
      - 'regression testing'
      - 'minimal refactoring'
      - 'deterministic output'
    acceptance:
      # The performance target is listed among the functional criteria; this
      # entry has no separate performance key.
      functional:
        - 'All prior behavior remains unchanged.'
        - 'Performance fixture meets target.'
        - 'Output ordering remains stable.'
        - 'Public API remains compatible.'
      maintainability:
        - 'Optimization is understandable and localized.'
        - 'No broad rewrites unrelated to the bottleneck.'
      edge_cases:
        - 'Empty groups.'
        - 'Large repeated keys.'
        - 'Unicode or locale-neutral sorting as specified.'
    targeted_failure_modes:
      - 'performance_timeout'
      - 'regression'
      - 'nondeterminism'
      - 'poor_maintainability'