"""Deterministic JSON canonicalization, as specified by the PMP wire format.

Rules (milestone #2, ``spec/wire-format.md``):

* UTF-8 output, no BOM.
* Object members sorted by key in Unicode code point order.
* Separators are exactly ``,`` and ``:`` -- no insignificant whitespace.
* Permitted value types: ``null``, boolean, integer, finite float, string,
  array, object. Object keys must be strings.
* Floats are encoded in Python's ``repr`` form (shortest round-trip
  representation, the behaviour of :func:`json.dumps` since Python 3.1).
  ``NaN`` and ``+/-Infinity`` are rejected outright.
* Non-ASCII characters are emitted as raw UTF-8 (``ensure_ascii=False``).

Two documents with identical canonical bytes are the same document. Every
hash and every signature in PMP is computed over canonical bytes, never over
whatever happens to be on disk or on the wire.
"""

from __future__ import annotations

import hashlib
import json
import math
import string
from typing import Any

from pmp.errors import CanonicalizationError

HASH_PREFIX = "sha256:"
# The 16 lowercase hexadecimal digits; ``string.hexdigits`` also contains the
# uppercase forms, which ``lower()`` folds away before the set is built.
_HEX_DIGITS = set(string.hexdigits.lower())


def _require_utf8(text: str, what: str, path: str) -> None:
    """Raise :class:`CanonicalizationError` if *text* cannot be UTF-8 encoded.

    A Python ``str`` may hold lone surrogate code points, which the strict
    UTF-8 codec refuses to encode. *what* names the offending item in the
    error message and *path* is its location in the document.
    """
    try:
        text.encode("utf-8")
    except UnicodeEncodeError as err:
        raise CanonicalizationError(
            f"{what} not encodable as UTF-8 at {path}"
        ) from err


def _validate(value: Any, path: str = "$") -> None:
    """Recursively reject anything outside the canonical type set.

    *path* locates *value* inside the document for error messages: ``$`` is
    the root, ``[i]`` is appended for array items and ``.key`` for object
    members.

    Raises :class:`CanonicalizationError` for a non-finite float, a
    non-string object key, a string or key that cannot be encoded as UTF-8,
    or a value of any unsupported type.
    """
    # ``bool`` is a subclass of ``int``; it is listed only for readability.
    if value is None or isinstance(value, (bool, int)):
        return
    if isinstance(value, str):
        # Canonical output is UTF-8, so an unencodable string must be
        # rejected here; otherwise it would surface later as a bare
        # UnicodeEncodeError from the final ``encode`` call.
        _require_utf8(value, "string", path)
        return
    if isinstance(value, float):
        if not math.isfinite(value):
            raise CanonicalizationError(f"non-finite float at {path}")
        return
    if isinstance(value, (list, tuple)):
        for i, item in enumerate(value):
            _validate(item, f"{path}[{i}]")
        return
    if isinstance(value, dict):
        for key, item in value.items():
            if not isinstance(key, str):
                raise CanonicalizationError(
                    f"non-string object key {key!r} at {path}"
                )
            # Keys are emitted verbatim too, so they get the same check.
            _require_utf8(key, f"object key {key!r}", path)
            _validate(item, f"{path}.{key}")
        return
    raise CanonicalizationError(
        f"unsupported type {type(value).__name__} at {path}"
    )


def canonical_json(value: Any) -> bytes:
    """Serialize *value* to canonical JSON bytes.

    Raises :class:`CanonicalizationError` if *value* contains anything
    outside the canonical type set, including strings or object keys that
    cannot be encoded as UTF-8.
    """
    # Validation runs first so every rejection is a CanonicalizationError
    # carrying the path of the offending item.
    _validate(value)
    text = json.dumps(
        value,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
        allow_nan=False,
    )
    return text.encode("utf-8")


def sha256_hex(data: bytes) -> str:
    """Lowercase hex SHA-256 digest of raw bytes."""
    return hashlib.sha256(data).hexdigest()


def canonical_hash(value: Any) -> str:
    """Hash reference (``sha256:<hex>``) over the canonical bytes of *value*.

    Raises :class:`CanonicalizationError` under the same conditions as
    :func:`canonical_json`.
    """
    return HASH_PREFIX + sha256_hex(canonical_json(value))


def is_hash_ref(value: Any) -> bool:
    """True if *value* is a well-formed ``sha256:<64 lowercase hex>`` reference."""
    if not isinstance(value, str) or not value.startswith(HASH_PREFIX):
        return False
    digest = value[len(HASH_PREFIX):]
    return len(digest) == 64 and all(c in _HEX_DIGITS for c in digest)