Skip to content

geno_lewm.provenance

provenance

Artifact provenance primitives for GenoLeWM.

This is the preferred public import path for manifests, hashes, input/output commitments, and checksum receipts. The package does not implement or claim runtime assurance beyond checksum provenance.

DtypeConfig dataclass

DtypeConfig(encoder_dtype: str, predictor_dtype: str)

Numerical-precision commit shape.

PoolingConfig dataclass

PoolingConfig(state_layer: int, pool_type: str, pool_radius: int, normalize: bool)

State-encoder pooling configuration commit shape (RFC-0002).

Manifest dataclass

Manifest(schema_version: str, model_name: str, model_version: str, release_id: str, encoder: ManifestEncoder, predictor: ManifestArtifact, action_encoder: ManifestArtifact, calibration: ManifestArtifact, training: ManifestTraining, eval: ManifestArtifact)

Top-level manifest (RFC-0011 §3.7).

to_canonical_dict

to_canonical_dict() -> dict[str, Any]

Return the dict mirror used for canonical JSON.

Dataclass asdict walks nested frozen dataclasses and is deterministic, so the result is byte-stable when fed to the canonical JSON encoder (which also sorts keys).

Source code in geno_lewm/provenance/manifest.py
def to_canonical_dict(self) -> dict[str, Any]:
    """Build the plain-dict mirror of this manifest for canonical JSON.

    The mirror is produced by ``dataclasses.asdict``, which recurses
    into nested dataclass fields, so nested manifest sections come back
    as nested dicts. Key order is not significant here: the canonical
    JSON encoder is expected to sort keys before hashing or writing.
    """
    mirror: dict[str, Any] = asdict(self)
    return mirror

model_id

model_id() -> str

Return model_id = SHA-256(canonical_json(manifest)).

Source code in geno_lewm/provenance/manifest.py
def model_id(self) -> str:
    """Return the model identifier derived from this manifest.

    The identifier is ``canonical_json_sha256`` applied to the dict
    returned by ``to_canonical_dict()``.
    """
    canonical_view = self.to_canonical_dict()
    return canonical_json_sha256(canonical_view)

ManifestArtifact dataclass

ManifestArtifact(file: str, hash: str, dtype: str | None = None, version: str | None = None)

A single artifact file referenced by the manifest.

ManifestEncoder dataclass

ManifestEncoder(id: str, revision: str, hash: str)

Carbon encoder identity.

ManifestTraining dataclass

ManifestTraining(config_file: str, hash: str, data_snapshot: dict[str, str] = dict())

Training config + data-snapshot identifiers.

Receipt dataclass

Receipt(schema_version: str, model_id: str, input_commitment: str, output: ReceiptOutput, output_commitment: str, calibration_hash: str, runtime: ReceiptRuntime, timestamp: str, provenance: ReceiptProvenance)

Top-level receipt (RFC-0011 §3.3).

ReceiptOutput dataclass

ReceiptOutput(sigma_raw: float, sigma_calibrated: float, bucket_id: str, confidence: float, low_confidence: bool)

Score-call output committed by the receipt.

ReceiptProvenance dataclass

ReceiptProvenance(kind: str, details: dict[str, Any] | None = None)

Checksum provenance block, serialized under the "provenance" key in v1 receipt JSON.

ReceiptRuntime dataclass

ReceiptRuntime(backend: str, device: str, geno_lewm_version: str, carbon_revision: str)

Runtime / environment block.

compute_input_commitment

compute_input_commitment(reference_window: str, edit_spec: EditSpec, pooling_config: PoolingConfig, dtype_config: DtypeConfig) -> str

Return the "sha256:<hex>" input commitment for a scoring call.

The canonical payload is a dict with fixed keys; canonical-JSON encoding handles ordering and stability.

Source code in geno_lewm/provenance/commitment.py
def compute_input_commitment(
    reference_window: str,
    edit_spec: EditSpec,
    pooling_config: PoolingConfig,
    dtype_config: DtypeConfig,
) -> str:
    """Hash the inputs of a scoring call into a single commitment string.

    The reference window, the edit spec (via
    ``_editspec_to_commit_dict``), the pooling settings, the dtype
    settings, and a payload ``version`` of 1 are gathered into one dict
    and passed to ``canonical_json_sha256``.

    Raises:
        InputError: If ``reference_window`` is not a ``str`` or is empty.
    """
    # Guard clauses: reject unusable reference windows before any hashing.
    if not isinstance(reference_window, str):
        window_type = type(reference_window).__name__
        raise InputError(
            "reference_window must be a string of bases",
            details={"type": window_type},
        )
    if not reference_window:
        raise InputError(
            "reference_window must be non-empty",
            details={"len": 0},
        )

    # Sections are built in the same order as they appear in the payload.
    edit_commit = _editspec_to_commit_dict(edit_spec)
    pooling_commit = {
        "state_layer": pooling_config.state_layer,
        "pool_type": pooling_config.pool_type,
        "pool_radius": pooling_config.pool_radius,
        "normalize": pooling_config.normalize,
    }
    dtype_commit = {
        "encoder_dtype": dtype_config.encoder_dtype,
        "predictor_dtype": dtype_config.predictor_dtype,
    }

    return canonical_json_sha256(
        {
            "reference_window": reference_window,
            "edit_spec": edit_commit,
            "pooling_config": pooling_commit,
            "dtype_config": dtype_commit,
            "version": 1,
        }
    )

canonical_json_sha256

canonical_json_sha256(value: Any) -> str

Return "sha256:<hex>" for the canonical JSON of value.

Source code in geno_lewm/provenance/hashing.py
def canonical_json_sha256(value: Any) -> str:
    """Hash the canonical JSON encoding of ``value``.

    ``value`` is encoded with ``canonical_json_bytes``; the SHA-256 hex
    digest of those bytes is returned with ``_PREFIX`` prepended.
    """
    encoded = canonical_json_bytes(value)
    hex_digest = hashlib.sha256(encoded).hexdigest()
    return _PREFIX + hex_digest

sha256_bytes

sha256_bytes(data: bytes | bytearray | memoryview) -> str

Return "sha256:<hex>" for data.

Source code in geno_lewm/provenance/hashing.py
def sha256_bytes(data: bytes | bytearray | memoryview) -> str:
    """Hash an in-memory buffer.

    The buffer is copied into an immutable ``bytes`` object before
    hashing; the SHA-256 hex digest is returned with ``_PREFIX``
    prepended.
    """
    hasher = hashlib.sha256()
    hasher.update(bytes(data))
    return _PREFIX + hasher.hexdigest()

sha256_file

sha256_file(path: str | Path) -> str

Return "sha256:<hex>" for the file at path.

Streams the file in 1 MiB chunks; safe for arbitrarily large artifacts (weights files can be multi-GB).

Source code in geno_lewm/provenance/hashing.py
def sha256_file(path: str | Path) -> str:
    """Hash the contents of the file at ``path``.

    The file is opened in binary mode and fed to SHA-256 in reads of
    ``_CHUNK`` bytes, so the whole file is never held in memory at
    once. The hex digest is returned with ``_PREFIX`` prepended.
    """
    target = Path(path)
    digest = hashlib.sha256()
    with target.open("rb") as stream:
        # An empty read marks end-of-file and ends the loop.
        while block := stream.read(_CHUNK):
            digest.update(block)
    return _PREFIX + digest.hexdigest()

load_manifest

load_manifest(path: str | Path) -> Manifest

Load and validate a manifest from disk.

Source code in geno_lewm/provenance/manifest.py
def load_manifest(path: str | Path) -> Manifest:
    """Load and validate a manifest from disk.

    Args:
        path: Location of the manifest JSON file.

    Returns:
        The ``Manifest`` built by ``_from_dict`` from the decoded JSON.

    Raises:
        InputError: If the file contents are not valid UTF-8 or not
            valid JSON.
        OSError: If the file cannot be read (propagated unchanged from
            ``Path.read_bytes``).
    """
    p = Path(path)
    raw = p.read_bytes()
    try:
        d = json.loads(raw.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        # ``json.JSONDecodeError`` does not cover a failed UTF-8 decode;
        # catching both keeps every "unparseable manifest" case on the
        # same InputError path instead of leaking UnicodeDecodeError.
        raise InputError(
            "manifest is not valid JSON",
            details={"path": str(p), "error": str(exc)},
        ) from exc
    return _from_dict(d)

write_manifest

write_manifest(manifest: Manifest, path: str | Path) -> Path

Write a manifest to disk as canonical JSON.

The on-disk bytes are byte-stable across platforms.

Source code in geno_lewm/provenance/manifest.py
def write_manifest(manifest: Manifest, path: str | Path) -> Path:
    """Serialize ``manifest`` to ``path`` and return the written path.

    Missing parent directories are created first. The bytes written are
    exactly what ``manifest.to_canonical_json()`` returns.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    encoded = manifest.to_canonical_json()
    target.write_bytes(encoded)
    return target

compute_output_commitment

compute_output_commitment(output: ReceiptOutput) -> str

Compute the output-commitment hash for an output block.

Separated from Receipt so callers can pre-compute the commitment before assembling the receipt.

Source code in geno_lewm/provenance/receipt.py
def compute_output_commitment(output: ReceiptOutput) -> str:
    """Hash a receipt output block into its commitment string.

    The output dataclass is flattened with ``dataclasses.asdict`` and
    passed to ``canonical_json_sha256``. This is a standalone function
    so the commitment can be computed before a ``Receipt`` exists.
    """
    output_fields = asdict(output)
    return canonical_json_sha256(output_fields)

parse_receipt_payload

parse_receipt_payload(payload: Any) -> Receipt

Validate a decoded receipt payload.

Source code in geno_lewm/provenance/receipt.py
def parse_receipt_payload(payload: Any) -> Receipt:
    """Check the shape of a decoded receipt and build a ``Receipt``.

    Validation runs in a fixed order: the top-level object, then the
    ``output`` block, then the ``runtime`` block, then the
    ``provenance`` block. Only after every check passes is the
    ``Receipt`` assembled.

    Raises:
        ReceiptSchemaError: If ``payload`` or one of its ``output`` /
            ``runtime`` blocks is not a dict, or if ``provenance`` is
            not a dict holding ``kind`` and at most ``details``.
            ``_require_keys`` is expected to raise for missing keys.
    """
    if not isinstance(payload, dict):
        raise ReceiptSchemaError(
            "receipt top-level must be an object",
            details={"type": type(payload).__name__},
        )
    _require_keys("receipt", payload, _REQUIRED_TOP)

    output_block = payload["output"]
    if not isinstance(output_block, dict):
        raise ReceiptSchemaError(
            "output must be an object",
            details={"got": type(output_block).__name__},
        )
    _require_keys("output", output_block, _REQUIRED_OUTPUT)

    runtime_block = payload["runtime"]
    if not isinstance(runtime_block, dict):
        raise ReceiptSchemaError(
            "runtime must be an object",
            details={"got": type(runtime_block).__name__},
        )
    _require_keys("runtime", runtime_block, _REQUIRED_RUNTIME)

    provenance_block = payload["provenance"]
    # Well-formed means: a dict, carrying "kind", with no keys other
    # than "kind" and "details".
    provenance_ok = (
        isinstance(provenance_block, dict)
        and "kind" in provenance_block
        and set(provenance_block) <= {"kind", "details"}
    )
    if not provenance_ok:
        raise ReceiptSchemaError(
            "receipt provenance must be an object with 'kind' (and optional 'details')",
            details={"provenance": provenance_block},
        )

    output_fields = (
        "sigma_raw",
        "sigma_calibrated",
        "bucket_id",
        "confidence",
        "low_confidence",
    )
    runtime_fields = (
        "backend",
        "device",
        "geno_lewm_version",
        "carbon_revision",
    )
    return Receipt(
        schema_version=payload["schema_version"],
        model_id=payload["model_id"],
        input_commitment=payload["input_commitment"],
        output=ReceiptOutput(**{name: output_block[name] for name in output_fields}),
        output_commitment=payload["output_commitment"],
        calibration_hash=payload["calibration_hash"],
        runtime=ReceiptRuntime(**{name: runtime_block[name] for name in runtime_fields}),
        timestamp=payload["timestamp"],
        provenance=ReceiptProvenance(
            kind=provenance_block["kind"],
            details=provenance_block.get("details"),
        ),
    )

read_receipt

read_receipt(path: str | Path) -> Receipt

Load and validate a receipt from disk.

Source code in geno_lewm/provenance/receipt.py
def read_receipt(path: str | Path) -> Receipt:
    """Load and validate a receipt from disk.

    Args:
        path: Location of the receipt JSON file.

    Returns:
        The ``Receipt`` produced by ``parse_receipt_payload``.

    Raises:
        ReceiptSchemaError: If the file contents are not valid UTF-8,
            not valid JSON, or do not decode to a JSON object. Schema
            errors from ``parse_receipt_payload`` also propagate.
        OSError: If the file cannot be read (propagated unchanged from
            ``Path.read_bytes``).
    """
    p = Path(path)
    raw = p.read_bytes()
    try:
        d = json.loads(raw.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        # ``json.JSONDecodeError`` does not cover a failed UTF-8 decode;
        # catching both keeps every "unparseable receipt" case on the
        # same ReceiptSchemaError path instead of leaking
        # UnicodeDecodeError to the caller.
        raise ReceiptSchemaError(
            "receipt is not valid JSON",
            details={"path": str(p), "error": str(exc)},
        ) from exc

    # Checked here as well as in the parser so the error carries the path.
    if not isinstance(d, dict):
        raise ReceiptSchemaError(
            "receipt top-level must be an object",
            details={"path": str(p), "type": type(d).__name__},
        )
    return parse_receipt_payload(d)

write_receipt

write_receipt(receipt: Receipt, path: str | Path) -> Path

Write a receipt as canonical JSON; the written bytes are stable across write/read round trips.

Source code in geno_lewm/provenance/receipt.py
def write_receipt(receipt: Receipt, path: str | Path) -> Path:
    """Serialize ``receipt`` to ``path`` and return the written path.

    Missing parent directories are created first. The bytes written are
    exactly what ``receipt.to_canonical_json()`` returns.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    encoded = receipt.to_canonical_json()
    target.write_bytes(encoded)
    return target