"""MNC-1 canonical encoding. Every signed or content-addressed object in Mnema is encoded with a single deterministic encoding so that independent implementations produce byte-identical signing inputs and identical content hashes. MNC-1 rules ----------- 1. The value space is: ``null``, booleans, 64-bit signed integers, UTF-8 strings, arrays, and objects with string keys. 2. **Floats are forbidden.** Fractional quantities (e.g. confidence) are carried as scaled integers (parts-per-million) so that hashing never depends on float formatting. 3. Object keys are sorted by Unicode code point (equivalently, by UTF-8 byte order — UTF-8 preserves code-point ordering). 4. No insignificant whitespace; separators are ``,`` and ``:``. 5. Strings are emitted with minimal JSON escaping and encoded as UTF-8 (``ensure_ascii`` is off). Lone surrogates are rejected. 6. Nesting depth is limited to 64. The encoding is a strict subset of JSON, so any JSON parser can read it; only the *encoder* must be MNC-1 aware. """ from __future__ import annotations import json from typing import Any __all__ = ["CanonicalEncodingError", "canonical_dumps", "canonical_bytes"] class CanonicalEncodingError(ValueError): """Raised when an object cannot be canonically encoded under MNC-1.""" _MAX_DEPTH = 64 _INT_MAX = (1 << 63) - 1 _INT_MIN = -(1 << 63) def _validate(obj: Any, depth: int = 0, path: str = "$") -> None: if depth > _MAX_DEPTH: raise CanonicalEncodingError(f"{path}: nesting exceeds MNC-1 depth limit of {_MAX_DEPTH}") if obj is None or isinstance(obj, bool): return # NOTE: bool is a subclass of int in Python, so the bool check above # must come first. if isinstance(obj, int): if not (_INT_MIN <= obj <= _INT_MAX): raise CanonicalEncodingError(f"{path}: integer out of 64-bit signed range") return if isinstance(obj, float): raise CanonicalEncodingError( f"{path}: floats are forbidden in MNC-1; encode fractional values " "as scaled integers (e.g. confidence_ppm)" ) if isinstance(obj, str): try: obj.encode("utf-8") except UnicodeEncodeError as exc: raise CanonicalEncodingError(f"{path}: string is not valid UTF-8: {exc}") from exc return if isinstance(obj, (list, tuple)): for i, item in enumerate(obj): _validate(item, depth + 1, f"{path}[{i}]") return if isinstance(obj, dict): for key, value in obj.items(): if not isinstance(key, str): raise CanonicalEncodingError( f"{path}: object keys must be strings, got {type(key).__name__!r}" ) try: key.encode("utf-8") except UnicodeEncodeError as exc: raise CanonicalEncodingError(f"{path}: key is not valid UTF-8: {exc}") from exc _validate(value, depth + 1, f"{path}.{key}") return raise CanonicalEncodingError( f"{path}: type {type(obj).__name__!r} is not representable in MNC-1" ) def canonical_dumps(obj: Any) -> str: """Encode *obj* as an MNC-1 canonical string. Raises :class:`CanonicalEncodingError` if *obj* is outside the MNC-1 value space (floats, non-string keys, excessive depth, oversized integers, invalid UTF-8). """ _validate(obj) return json.dumps( obj, sort_keys=True, separators=(",", ":"), ensure_ascii=False, allow_nan=False, ) def canonical_bytes(obj: Any) -> bytes: """Encode *obj* as MNC-1 canonical UTF-8 bytes (the signing/hash input).""" return canonical_dumps(obj).encode("utf-8")