"""Fetch quality-scale tiers for Home Assistant integrations. For each integration domain this script reads ``homeassistant/components//manifest.json`` from the core repo via ``raw.githubusercontent.com`` (no API rate limit; we self-throttle to be polite) and records the ``quality_scale`` field plus a few useful manifest attributes. With ``--rules`` it additionally fetches the integration's ``quality_scale.yaml`` and counts rule statuses (done / exempt / todo). Domain lists: * ``--domains-csv`` -- a CSV with a ``domain`` column (e.g. the output of ``ha-fetch-analytics``), optionally truncated with ``--limit``. * ``--all`` -- enumerate every directory under ``homeassistant/components`` via the GitHub git-trees API (3 API calls, then one raw fetch per component; ~2500 fetches, cached on disk, so the first full run takes roughly 10-15 minutes and re-runs are instant). """ from __future__ import annotations import argparse import json import sys import time from collections import Counter from pathlib import Path import yaml from .common import ( FileCache, cached_get, github_headers, make_session, read_domains_csv, write_csv, ) RAW_URL = ("https://raw.githubusercontent.com/home-assistant/core/" "{branch}/homeassistant/components/{domain}/{filename}") API_BASE = "https://api.github.com/repos/home-assistant/core" POLITENESS_DELAY = 0.15 # seconds between uncached raw fetches def list_all_components(session, cache: FileCache, branch: str) -> list[str]: """Enumerate component directories using the git-trees API (non-recursive, three small requests; avoids the 1000-entry contents-API truncation and the multi-hundred-MB tarball download).""" headers = github_headers() status, text = cached_get( session, cache, f"{API_BASE}/branches/{branch}", headers=headers) if status != 200: raise RuntimeError(f"branches/{branch} -> HTTP {status}") commit_sha = json.loads(text)["commit"]["sha"] def tree_entries(sha: str) -> list[dict]: st, body = cached_get( session, cache, f"{API_BASE}/git/trees/{sha}", headers=headers) if st != 200: raise RuntimeError(f"git/trees/{sha} -> HTTP {st}") return json.loads(body)["tree"] # commit -> root tree st, body = cached_get( session, cache, f"{API_BASE}/git/commits/{commit_sha}", headers=headers) if st != 200: raise RuntimeError(f"git/commits/{commit_sha} -> HTTP {st}") root_tree_sha = json.loads(body)["tree"]["sha"] def child_tree_sha(parent_sha: str, name: str) -> str: for entry in tree_entries(parent_sha): if entry["path"] == name and entry["type"] == "tree": return entry["sha"] raise RuntimeError(f"tree entry '{name}' not found under {parent_sha}") ha_sha = child_tree_sha(root_tree_sha, "homeassistant") components_sha = child_tree_sha(ha_sha, "components") return sorted( entry["path"] for entry in tree_entries(components_sha) if entry["type"] == "tree" ) def _parse_rules(yaml_text: str) -> Counter: """Count rule statuses in a quality_scale.yaml body. Rule values are either a bare string status ('done', 'exempt', 'todo') or a mapping with a 'status' key (and an optional 'comment'). """ counts: Counter = Counter() doc = yaml.safe_load(yaml_text) if not isinstance(doc, dict): return counts rules = doc.get("rules") if not isinstance(rules, dict): return counts for value in rules.values(): if isinstance(value, str): counts[value.strip().lower()] += 1 elif isinstance(value, dict): status = str(value.get("status", "unknown")).strip().lower() counts[status] += 1 else: counts["unknown"] += 1 return counts def fetch_one(session, cache: FileCache, branch: str, domain: str, with_rules: bool) -> dict[str, object]: manifest_url = RAW_URL.format( branch=branch, domain=domain, filename="manifest.json") was_cached = cache.get(manifest_url) is not None status, text = cached_get(session, cache, manifest_url) if not was_cached: time.sleep(POLITENESS_DELAY) row: dict[str, object] = { "domain": domain, "quality_scale": "", "integration_type": "", "iot_class": "", "code_owners": "", "rules_done": "", "rules_exempt": "", "rules_todo": "", } if status == 404: row["quality_scale"] = "missing" # not in core (custom/renamed) return row if status != 200: row["quality_scale"] = f"error_http_{status}" return row try: manifest = json.loads(text) except json.JSONDecodeError: row["quality_scale"] = "error_bad_manifest" return row row["quality_scale"] = str(manifest.get("quality_scale", "unscored")) row["integration_type"] = str(manifest.get("integration_type", "")) row["iot_class"] = str(manifest.get("iot_class", "")) owners = manifest.get("codeowners", []) row["code_owners"] = len(owners) if isinstance(owners, list) else "" if with_rules: rules_url = RAW_URL.format( branch=branch, domain=domain, filename="quality_scale.yaml") rules_cached = cache.get(rules_url) is not None r_status, r_text = cached_get(session, cache, rules_url) if not rules_cached: time.sleep(POLITENESS_DELAY) if r_status == 200: try: counts = _parse_rules(r_text) except yaml.YAMLError: counts = Counter() row["rules_done"] = counts.get("done", 0) row["rules_exempt"] = counts.get("exempt", 0) row["rules_todo"] = counts.get("todo", 0) return row def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( description="Fetch quality-scale tiers from the home-assistant/core repo." ) source = parser.add_mutually_exclusive_group(required=True) source.add_argument("--domains-csv", help="CSV with a 'domain' column to scan") source.add_argument("--all", action="store_true", help="scan every integration in core (slow first run)") parser.add_argument("--limit", type=int, default=0, help="cap the number of domains scanned (0 = no cap)") parser.add_argument("--branch", default="dev", help="core branch/tag to read (default: %(default)s)") parser.add_argument("--rules", action="store_true", help="also fetch quality_scale.yaml rule statuses") parser.add_argument("--out", default="data/generated/quality_scale_raw.csv") parser.add_argument("--distribution-out", default="", help="optional CSV of tier -> count aggregation") parser.add_argument("--cache-dir", default=str(Path("tools") / ".cache")) parser.add_argument("--cache-ttl", type=float, default=7 * 24 * 3600, help="cache TTL in seconds (default: 7 days)") args = parser.parse_args(argv) session = make_session() cache = FileCache(args.cache_dir, ttl_seconds=args.cache_ttl) try: if args.all: domains = list_all_components(session, cache, args.branch) if args.limit: domains = domains[: args.limit] else: domains = read_domains_csv(args.domains_csv, limit=args.limit) except (RuntimeError, ValueError, OSError, json.JSONDecodeError) as exc: print(f"error: {exc}", file=sys.stderr) return 1 print(f"scanning {len(domains)} integrations on branch '{args.branch}'" f"{' (with rule detail)' if args.rules else ''} ...") rows = [] for index, domain in enumerate(domains, start=1): rows.append(fetch_one(session, cache, args.branch, domain, args.rules)) if index % 100 == 0: print(f" {index}/{len(domains)} done") fields = ["domain", "quality_scale", "integration_type", "iot_class", "code_owners", "rules_done", "rules_exempt", "rules_todo"] written = write_csv(args.out, fields, rows) print(f"wrote {written} rows to {args.out}") if args.distribution_out: tiers = Counter(str(row["quality_scale"]) for row in rows) total = sum(tiers.values()) or 1 dist_rows = [ {"tier": tier, "count": count, "share_pct": f"{100.0 * count / total:.1f}"} for tier, count in sorted(tiers.items(), key=lambda kv: (-kv[1], kv[0])) ] write_csv(args.distribution_out, ["tier", "count", "share_pct"], dist_rows) print(f"wrote tier distribution to {args.distribution_out}") return 0 if __name__ == "__main__": raise SystemExit(main())