---
# Seed benchmark catalog: baselines, evaluation modes, metrics, shared
# constraints, and the task list. Entries describe task intent only; they are
# not yet executable fixtures (see `purpose`).
#
# NOTE(review): the block structure below was reconstructed from key order.
# `security`, `maintainability`, `edge_cases`, and `performance` are assumed to
# be children of each task's `acceptance` mapping, and `targeted_failure_modes`
# / `baseline_notes` task-level keys — confirm against the consumer's schema.

suite_version: "0.1"
name: "AI Creates a Programming Language Initial Benchmark Catalog"
purpose: >-
  Seed catalog for evaluating an LLM-first programming language against
  Python, Rust, and TypeScript. This catalog defines task intent, expected
  capabilities, and scoring concerns; later milestones should convert entries
  into executable benchmark fixtures.

# Languages under comparison and the practices each solution must follow.
baselines:
  - language: "Python"
    target: "current CPython"
    required_practices:
      - "Use declared dependencies only."
      - "Use type hints where they clarify public boundaries."
      - "Use deterministic tests and no hidden network access unless required."
  - language: "Rust"
    target: "current stable Rust"
    required_practices:
      - "Use Cargo project conventions."
      - "Use idiomatic Result and Option handling."
      - "Use declared crates only."
  - language: "TypeScript"
    target: "current TypeScript with Node.js LTS-compatible runtime"
    required_practices:
      - "Use strict compiler settings."
      - "Use declared dependencies only."
      - "Use runtime validation when external input requires it."
  - language: "NewLanguage"
    target: "project prototype version under test"
    required_practices:
      - "Use the real compiler/tooling state available at evaluation time."
      - "Record unsupported features separately from failed attempts."
      - "Use declared packages and capabilities only."

# Mode ids referenced by each task's `modes` list.
evaluation_modes:
  - id: "one_shot"
    description: "Generate a solution without build or test feedback."
  - id: "compiler_repair"
    description: "Allow up to five repair iterations using compiler/build/lint diagnostics."
  - id: "agent_loop"
    description: "Allow a tool-using agent to run public tests and edit within a fixed budget."
  - id: "maintenance"
    description: "Modify an existing codebase according to a new requirement."

primary_metrics:
  - id: "hidden_test_pass_rate"
    description: "Fraction of hidden tests passed."
  - id: "first_pass_build_success"
    description: "Whether the first generated solution parses, resolves dependencies, builds, and has an entrypoint."
  - id: "repair_iterations"
    description: "Number of feedback/edit cycles required to reach the best submitted solution."
  - id: "hallucinated_api_count"
    description: "Number of nonexistent or wrong-version symbols, modules, methods, packages, or configuration keys."
  - id: "security_defect_count"
    description: "Number and severity of security defects."
  - id: "performance_result"
    description: "Whether runtime and memory constraints are satisfied."
  - id: "maintainability_score"
    description: "Rubric-based review score for clarity, modularity, and auditability."

# Constraints that apply to every task in addition to its own acceptance rules.
common_constraints:
  - "Solutions must include a runnable entrypoint or callable API as specified by each task."
  - "Solutions must not rely on undeclared dependencies."
  - "Solutions must not access the network unless the task explicitly allows it."
  - "Solutions must handle invalid input according to the task specification."
  - "Generated tests must not replace or weaken benchmark tests."

tasks:
  # ---- tier: micro ----
  - id: "micro_log_event_parser"
    title: "Typed log event parser"
    tier: "micro"
    category: "data_parsing_validation"
    modes: ["one_shot", "compiler_repair", "agent_loop"]
    prompt_summary: >-
      Parse newline-delimited log entries with timestamps, levels, component
      names, and key-value fields into typed events. Return recoverable parse
      errors with line numbers.
    required_capabilities:
      - "string parsing"
      - "typed variants or equivalent"
      - "recoverable error reporting"
      - "timestamp validation"
    acceptance:
      functional:
        - "Parses valid log lines into structured records."
        - "Reports all invalid lines without discarding valid lines."
        - "Preserves field ordering only where specified."
      edge_cases:
        - "Empty input."
        - "Escaped separators in values."
        - "Unknown log levels."
        - "Malformed timestamps."
      performance: "Handles 100000 short lines within the task time limit."
    targeted_failure_modes:
      - "edge_case_failure"
      - "unhandled_error"
      - "stringly_typed_variants"
      - "performance_timeout"
    baseline_notes:
      python: "May use standard datetime and dataclasses."
      rust: "May use standard library only unless date parsing dependency is explicitly permitted by the fixture."
      typescript: "Must validate runtime input shapes explicitly."

  - id: "micro_topological_sort"
    title: "Topological sort with cycle explanation"
    tier: "micro"
    category: "algorithms_edge_cases"
    modes: ["one_shot", "compiler_repair", "agent_loop"]
    prompt_summary: >-
      Given named nodes and directed dependency edges, return a stable
      topological ordering or a minimal human-readable cycle witness.
    required_capabilities:
      - "graph traversal"
      - "deterministic ordering"
      - "typed error result"
    acceptance:
      functional:
        - "Returns a valid order for acyclic graphs."
        - "Reports a real cycle for cyclic graphs."
        - "Uses deterministic tie-breaking by input order."
      edge_cases:
        - "Disconnected graph."
        - "Duplicate edges."
        - "Self-cycle."
        - "Unknown node in edge."
      performance: "Handles 50000 nodes and 200000 edges within the time limit."
    targeted_failure_modes:
      - "logic_error"
      - "performance_timeout"
      - "nondeterminism"
      - "edge_case_failure"

  - id: "micro_interval_merge"
    title: "Interval merge with boundary semantics"
    tier: "micro"
    category: "algorithms_edge_cases"
    modes: ["one_shot", "compiler_repair"]
    prompt_summary: >-
      Merge intervals with explicit open/closed endpoint semantics and report
      invalid intervals. Adjacent intervals merge only when the specification
      says their endpoints overlap.
    required_capabilities:
      - "ordering"
      - "boundary condition handling"
      - "input validation"
    acceptance:
      functional:
        - "Merges overlapping intervals correctly."
        - "Distinguishes open and closed endpoints."
        - "Rejects invalid reversed intervals."
      edge_cases:
        - "Zero-length closed interval."
        - "Zero-length open interval."
        - "Adjacent but non-overlapping intervals."
        - "Unsorted input."
      performance: "O(n log n) expected due to sorting."
    targeted_failure_modes:
      - "edge_case_failure"
      - "logic_error"
      - "test_overfit"

  - id: "micro_expr_eval"
    title: "Small expression parser and evaluator"
    tier: "micro"
    category: "data_parsing_validation"
    modes: ["one_shot", "compiler_repair", "agent_loop"]
    prompt_summary: >-
      Implement a parser and evaluator for arithmetic expressions with
      variables, precedence, parentheses, unary minus, and typed evaluation
      errors.
    required_capabilities:
      - "recursive descent or equivalent parsing"
      - "operator precedence"
      - "typed errors"
      - "deterministic evaluation"
    acceptance:
      functional:
        - "Evaluates valid expressions with the specified precedence."
        - "Reports undefined variables."
        - "Reports division by zero."
        - "Reports syntax errors with positions."
      edge_cases:
        - "Nested parentheses."
        - "Unary minus before parentheses."
        - "Whitespace variants."
        - "Large integer inputs as specified by fixture."
      performance: "Linear or near-linear parse time for valid expressions."
    targeted_failure_modes:
      - "syntax_error"
      - "logic_error"
      - "unhandled_error"
      - "edge_case_failure"

  - id: "micro_canonical_json_diff"
    title: "Canonical JSON-like diff"
    tier: "micro"
    category: "data_parsing_validation"
    modes: ["one_shot", "compiler_repair"]
    prompt_summary: >-
      Compare two JSON-like values and emit a deterministic list of path-based
      differences with stable ordering and clear handling of missing versus
      null.
    required_capabilities:
      - "recursive data modeling"
      - "deterministic traversal"
      - "serialization"
    acceptance:
      functional:
        - "Reports added, removed, and changed values."
        - "Distinguishes missing fields from null values."
        - "Sorts object keys deterministically."
      edge_cases:
        - "Arrays of different lengths."
        - "Nested objects."
        - "Primitive type changes."
      performance: "Handles deeply nested but valid inputs without stack overflow where language permits."
    targeted_failure_modes:
      - "null_or_optional_misuse"
      - "nondeterminism"
      - "logic_error"

  - id: "micro_lru_cache"
    title: "Least-recently-used cache"
    tier: "micro"
    category: "algorithms_edge_cases"
    modes: ["one_shot", "compiler_repair"]
    prompt_summary: >-
      Implement a fixed-capacity LRU cache with get, put, update, delete, and
      deterministic state inspection for tests.
    required_capabilities:
      - "mutable state discipline"
      - "hash map plus ordering or equivalent"
      - "capacity validation"
    acceptance:
      functional:
        - "Evicts the least recently used item."
        - "Updates recency on get and put."
        - "Handles capacity one."
        - "Rejects zero capacity."
      edge_cases:
        - "Repeated put for same key."
        - "Delete missing key."
        - "Inspection does not alter recency."
      performance: "Expected O(1) get and put for large operation sequences."
    targeted_failure_modes:
      - "logic_error"
      - "performance_timeout"
      - "state_mutation_bug"

  # ---- tier: meso ----
  - id: "meso_cli_task_store"
    title: "Persistent CLI task manager"
    tier: "meso"
    category: "command_line_tools"
    modes: ["compiler_repair", "agent_loop"]
    prompt_summary: >-
      Build a command-line task manager that stores tasks in a JSON file,
      supports add/list/done/delete, validates input, and writes atomically.
    required_capabilities:
      - "CLI argument parsing"
      - "filesystem effects"
      - "JSON serialization"
      - "atomic write or safe replacement"
      - "recoverable errors"
    acceptance:
      functional:
        - "Creates a store when missing."
        - "Lists tasks deterministically."
        - "Marks tasks done without changing IDs."
        - "Handles corrupt storage with a clear error."
      security:
        - "Does not write outside the requested store path."
        - "Does not execute shell commands."
      edge_cases:
        - "Empty title."
        - "Concurrent-looking repeated writes in tests."
        - "Deletion of nonexistent task."
      performance: "Handles 10000 tasks within task limits."
    targeted_failure_modes:
      - "cross_file_mismatch"
      - "unhandled_error"
      - "missing_dependency"
      - "security_vulnerability"

  - id: "meso_markdown_link_checker"
    title: "Offline Markdown link checker"
    tier: "meso"
    category: "command_line_tools"
    modes: ["compiler_repair", "agent_loop"]
    prompt_summary: >-
      Scan Markdown files for local links and anchors, report broken links
      with source positions, and avoid network access.
    required_capabilities:
      - "directory traversal"
      - "Markdown link parsing subset"
      - "path normalization"
      - "structured reporting"
    acceptance:
      functional:
        - "Finds inline and reference-style local links in the supported subset."
        - "Validates relative file targets."
        - "Validates heading anchors with documented slug rules."
        - "Reports line and column where practical."
      security:
        - "Prevents path traversal outside the root."
        - "Performs no network requests."
      edge_cases:
        - "Escaped brackets."
        - "Duplicate headings."
        - "Case sensitivity according to fixture platform rules."
      performance: "Handles 1000 small files within the time limit."
    targeted_failure_modes:
      - "path_traversal"
      - "logic_error"
      - "performance_timeout"
      - "hidden_runtime_behavior"

  - id: "meso_config_migrator"
    title: "Versioned configuration migrator"
    tier: "meso"
    category: "data_parsing_validation"
    modes: ["compiler_repair", "agent_loop", "maintenance"]
    prompt_summary: >-
      Migrate configuration documents across schema versions while preserving
      unknown extension fields and reporting invalid versions.
    required_capabilities:
      - "schema modeling"
      - "version dispatch"
      - "data preservation"
      - "typed migration errors"
    acceptance:
      functional:
        - "Migrates v1 and v2 documents to v3."
        - "Rejects unsupported versions."
        - "Preserves documented extension fields."
        - "Provides reversible migration notes where specified."
      edge_cases:
        - "Missing version."
        - "String versus numeric version."
        - "Unknown fields in nested sections."
      performance: "Handles large but valid config files without quadratic traversal."
    targeted_failure_modes:
      - "cross_file_mismatch"
      - "null_or_optional_misuse"
      - "edge_case_failure"

  - id: "meso_secret_redactor"
    title: "Streaming secret redaction pipeline"
    tier: "meso"
    category: "security_sensitive_programming"
    modes: ["compiler_repair", "agent_loop"]
    prompt_summary: >-
      Redact secrets from logs using multiple detectors, preserve line
      structure, report redaction counts, and avoid logging the original
      secret.
    required_capabilities:
      - "streaming text processing"
      - "safe logging"
      - "pattern matching"
      - "testable detectors"
    acceptance:
      functional:
        - "Redacts tokens matching specified secret patterns."
        - "Preserves non-secret text."
        - "Reports counts by detector."
        - "Handles secrets split by common delimiters as specified."
      security:
        - "Never emits full original secret in output or diagnostics."
        - "Uses deterministic redaction markers."
      edge_cases:
        - "Overlapping matches."
        - "Very long lines."
        - "False-positive allowlist entries."
      performance: "Processes large logs in a streaming or memory-bounded manner."
    targeted_failure_modes:
      - "security_vulnerability"
      - "secret_leakage"
      - "performance_timeout"
      - "logic_error"

  - id: "meso_rate_limiter"
    title: "Deterministic token-bucket rate limiter"
    tier: "meso"
    category: "services_apis"
    modes: ["compiler_repair", "agent_loop"]
    prompt_summary: >-
      Implement a token-bucket rate limiter using an injectable clock so tests
      can deterministically check refill and consume behavior.
    required_capabilities:
      - "time capability abstraction"
      - "numeric edge cases"
      - "state management"
      - "deterministic tests"
    acceptance:
      functional:
        - "Consumes tokens according to capacity and refill rate."
        - "Uses injected time, not global wall-clock time."
        - "Handles fractional refill semantics as specified."
        - "Rejects invalid capacity or rate."
      edge_cases:
        - "Large time jumps."
        - "No time elapsed."
        - "Burst at exact boundary."
      performance: "O(1) per consume operation."
    targeted_failure_modes:
      - "hidden_runtime_behavior"
      - "nondeterminism"
      - "logic_error"

  - id: "meso_worker_pool"
    title: "Bounded worker pool with cancellation"
    tier: "meso"
    category: "concurrent_async_systems"
    modes: ["compiler_repair", "agent_loop"]
    prompt_summary: >-
      Implement a bounded worker pool that processes jobs, returns ordered
      results, propagates errors, and supports cancellation without leaking
      work.
    required_capabilities:
      - "structured concurrency"
      - "bounded queues"
      - "error propagation"
      - "cancellation"
    acceptance:
      functional:
        - "Limits concurrent workers."
        - "Returns results in input order."
        - "Stops accepting work after cancellation."
        - "Cleans up workers on error."
      edge_cases:
        - "Zero jobs."
        - "Worker error midway."
        - "Cancellation before start."
        - "Cancellation during processing."
      performance: "Demonstrates parallel speedup with fake delayed jobs where runtime supports it."
    targeted_failure_modes:
      - "concurrency_error"
      - "unhandled_error"
      - "nondeterminism"
      - "resource_leak"

  - id: "meso_safe_file_join"
    title: "Safe path resolver"
    tier: "meso"
    category: "security_sensitive_programming"
    modes: ["one_shot", "compiler_repair", "agent_loop"]
    prompt_summary: >-
      Resolve user-provided relative paths under a configured root directory,
      preventing traversal, symlink escape according to fixture rules, and
      platform-specific separator confusion.
    required_capabilities:
      - "path normalization"
      - "security boundary modeling"
      - "error reporting"
    acceptance:
      functional:
        - "Accepts valid paths under root."
        - "Rejects parent traversal attempts."
        - "Rejects absolute paths."
        - "Handles separators consistently."
      security:
        - "Does not follow symlinks outside root when fixture enables symlink tests."
        - "Does not string-concatenate paths unsafely."
      edge_cases:
        - "Encoded-looking path segments."
        - "Repeated separators."
        - "Current-directory segments."
      performance: "O(path length) expected."
    targeted_failure_modes:
      - "security_vulnerability"
      - "path_traversal"
      - "platform_assumption"

  # ---- tier: macro ----
  - id: "macro_double_entry_ledger"
    title: "Double-entry ledger core"
    tier: "macro"
    category: "services_apis"
    modes: ["agent_loop", "maintenance"]
    prompt_summary: >-
      Build a ledger library with accounts, journal entries, balanced
      postings, validation, immutable audit history, and deterministic
      reports.
    required_capabilities:
      - "domain modeling"
      - "invariants"
      - "persistence abstraction"
      - "report generation"
      - "error handling"
    acceptance:
      functional:
        - "Rejects unbalanced journal entries."
        - "Prevents mutation of posted historical entries except through reversal."
        - "Produces deterministic account balances."
        - "Imports and exports fixture data."
      security:
        - "Does not expose unsafe deserialization."
      edge_cases:
        - "Zero-value postings according to specified rule."
        - "Currency mismatch."
        - "Reversal of nonexistent entry."
        - "Duplicate IDs."
      performance: "Handles 100000 postings within defined time and memory limits."
    targeted_failure_modes:
      - "logic_error"
      - "cross_file_mismatch"
      - "invariant_violation"
      - "poor_maintainability"

  - id: "macro_dependency_resolver"
    title: "Package dependency resolver"
    tier: "macro"
    category: "algorithms_edge_cases"
    modes: ["agent_loop"]
    prompt_summary: >-
      Resolve package versions with semantic constraints, conflicts, and
      deterministic explanation of unsatisfiable requirements.
    required_capabilities:
      - "constraint solving"
      - "semantic version comparison"
      - "backtracking or equivalent"
      - "conflict explanation"
    acceptance:
      functional:
        - "Finds a valid resolution when one exists."
        - "Reports a real minimal or near-minimal conflict when impossible."
        - "Handles pre-release rules as specified."
        - "Uses deterministic tie-breaking."
      edge_cases:
        - "Diamond dependency."
        - "Conflicting transitive constraints."
        - "Multiple valid highest versions."
      performance: "Handles medium fixture graphs within time limits."
    targeted_failure_modes:
      - "logic_error"
      - "performance_timeout"
      - "nondeterminism"
      - "edge_case_failure"

  - id: "macro_http_json_service"
    title: "Small HTTP JSON service"
    tier: "macro"
    category: "services_apis"
    modes: ["agent_loop", "maintenance"]
    prompt_summary: >-
      Build a small JSON API service with typed request validation, explicit
      error responses, idempotent create operation, and deterministic tests
      against an in-process server or equivalent.
    required_capabilities:
      - "routing"
      - "JSON schema validation"
      - "error mapping"
      - "idempotency"
      - "state management"
    acceptance:
      functional:
        - "Accepts valid create/get/list requests."
        - "Rejects invalid JSON with structured errors."
        - "Implements idempotency key semantics."
        - "Provides deterministic test setup and teardown."
      security:
        - "Does not leak internal errors."
        - "Rejects oversized payloads according to fixture limits."
      edge_cases:
        - "Duplicate idempotency key."
        - "Missing required field."
        - "Unknown field handling as specified."
      performance: "Meets basic latency constraints under concurrent fixture requests."
    targeted_failure_modes:
      - "wrong_api_version"
      - "security_vulnerability"
      - "concurrency_error"
      - "cross_file_mismatch"

  - id: "macro_rules_engine"
    title: "Declarative rules engine"
    tier: "macro"
    category: "data_parsing_validation"
    modes: ["agent_loop"]
    prompt_summary: >-
      Implement a small rules engine that loads declarative rules, validates
      them, evaluates facts, detects conflicting rules, and reports traces.
    required_capabilities:
      - "DSL parsing or structured config parsing"
      - "validation"
      - "deterministic evaluation"
      - "trace reporting"
    acceptance:
      functional:
        - "Evaluates matching rules in priority order."
        - "Rejects invalid rule references."
        - "Detects specified conflict patterns."
        - "Produces human-readable and machine-readable traces."
      edge_cases:
        - "No matching rule."
        - "Circular rule reference."
        - "Equal priority conflict."
      performance: "Handles large rule sets with expected indexing or filtering."
    targeted_failure_modes:
      - "logic_error"
      - "nondeterminism"
      - "performance_timeout"
      - "poor_maintainability"

  - id: "macro_crdt_note_merge"
    title: "Offline note merge component"
    tier: "macro"
    category: "concurrent_async_systems"
    modes: ["agent_loop"]
    prompt_summary: >-
      Merge offline edits to a note-like document using a specified CRDT-style
      operation model, preserving convergence, idempotence, and causal
      metadata.
    required_capabilities:
      - "distributed data modeling"
      - "merge invariants"
      - "property testing"
      - "deterministic serialization"
    acceptance:
      functional:
        - "Converges for operations applied in different orders."
        - "Handles duplicate operations idempotently."
        - "Rejects malformed operations."
        - "Serializes state deterministically."
      edge_cases:
        - "Concurrent insert at same position."
        - "Delete before receiving insert."
        - "Duplicate actor IDs according to fixture rule."
      performance: "Handles fixture operation histories within limits."
    targeted_failure_modes:
      - "logic_error"
      - "invariant_violation"
      - "nondeterminism"
      - "performance_timeout"

  # ---- tier: maintenance ----
  - id: "maint_add_schema_versioning"
    title: "Add schema versioning to an existing store"
    tier: "maintenance"
    category: "maintenance_refactoring"
    modes: ["maintenance"]
    prompt_summary: >-
      Given an existing small persisted data store, add schema versioning and
      migration from v1 to v2 without breaking existing behavior.
    required_capabilities:
      - "multi-file modification"
      - "backward compatibility"
      - "migration testing"
      - "interface preservation"
    acceptance:
      functional:
        - "Existing tests continue to pass."
        - "Old fixture files migrate correctly."
        - "New writes include the v2 schema marker."
        - "Invalid future versions fail safely."
      maintainability:
        - "Patch is focused and does not rewrite unrelated code."
        - "Public API remains stable unless specification requires change."
      edge_cases:
        - "Missing version marker."
        - "Corrupt file."
        - "Mixed old and new optional fields."
    targeted_failure_modes:
      - "cross_file_mismatch"
      - "regression"
      - "test_overfit"
      - "poor_maintainability"

  - id: "maint_add_authorization"
    title: "Add authorization checks to existing API"
    tier: "maintenance"
    category: "maintenance_refactoring"
    modes: ["maintenance"]
    prompt_summary: >-
      Given an existing API service, add role-based authorization to selected
      operations without changing unrelated behavior or leaking resource
      existence across unauthorized boundaries.
    required_capabilities:
      - "security policy modeling"
      - "minimal patching"
      - "regression avoidance"
      - "test updates"
    acceptance:
      functional:
        - "Allows authorized roles."
        - "Rejects unauthorized roles with specified response."
        - "Preserves existing successful behavior."
        - "Adds focused tests for authorization."
      security:
        - "Does not leak secret fields."
        - "Does not reveal forbidden resource existence when policy forbids it."
      edge_cases:
        - "Missing authentication context."
        - "Unknown role."
        - "Admin override according to fixture rule."
    targeted_failure_modes:
      - "security_vulnerability"
      - "authorization_bypass"
      - "regression"
      - "cross_file_mismatch"

  - id: "maint_optimize_without_regression"
    title: "Optimize slow grouping code"
    tier: "maintenance"
    category: "maintenance_refactoring"
    modes: ["maintenance"]
    prompt_summary: >-
      Given correct but slow grouping/reporting code, improve performance
      while preserving output ordering, public API, and edge-case behavior.
    required_capabilities:
      - "performance reasoning"
      - "regression testing"
      - "minimal refactoring"
      - "deterministic output"
    acceptance:
      functional:
        - "All prior behavior remains unchanged."
        - "Performance fixture meets target."
        - "Output ordering remains stable."
        - "Public API remains compatible."
      maintainability:
        - "Optimization is understandable and localized."
        - "No broad rewrites unrelated to the bottleneck."
      edge_cases:
        - "Empty groups."
        - "Large repeated keys."
        - "Unicode or locale-neutral sorting as specified."
    targeted_failure_modes:
      - "performance_timeout"
      - "regression"
      - "nondeterminism"
      - "poor_maintainability"