Skip to content

geno_lewm.cli.rollout

rollout

geno-lewm-rollout — aggregate measured rollout-fidelity state rows.

RolloutStateRow dataclass

RolloutStateRow(row_id: str, split: str, horizon: int, source_state: tuple[float, ...], predicted_state: tuple[float, ...], target_state: tuple[float, ...], target_rank: int, baseline_target_rank: int)

One measured rollout-fidelity row.

RolloutSummary dataclass

RolloutSummary(split: str, horizon: int | None, n: int, row_identity: str, cosine_similarity_mean: float, naive_baseline_cosine_mean: float, l2_distance_mean: float, naive_baseline_l2_mean: float, recall_at_k: float, naive_baseline_recall_at_k: float)

Aggregate metrics for one split or split/horizon group.

load_rollout_state_rows

load_rollout_state_rows(path: Path) -> tuple[RolloutStateRow, ...]

Load and validate measured rollout-state JSONL.

Source code in geno_lewm/cli/rollout.py
def load_rollout_state_rows(path: Path) -> tuple[RolloutStateRow, ...]:
    """Load and validate measured rollout-state JSONL.

    Every non-blank line of ``path`` is decoded as JSON and passed to
    ``_parse_rollout_row`` together with its 1-based line number. Blank and
    whitespace-only lines are skipped but still count towards line numbers.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The parsed rows, in file order.

    Raises:
        InputError: If the file cannot be read or is not valid UTF-8, if a
            non-blank line is not valid JSON, if no rows are present, or if
            two rows share the same ``row_id``. Errors raised by
            ``_parse_rollout_row`` propagate unchanged.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # listed explicitly for undecodable files to surface as InputError.
        raise InputError("failed to read rollout state JSONL", details={"path": str(path)}) from exc
    rows: list[RolloutStateRow] = []
    # Split on "\n" only. str.splitlines() also breaks on U+0085, U+2028 and
    # U+2029, which JSON allows unescaped inside string values, so it could cut
    # a valid record in half. Text-mode reading has already normalised "\r\n"
    # and "\r" to "\n", and the empty piece after a trailing newline is
    # dropped by the blank-line check below.
    for line_no, line in enumerate(text.split("\n"), start=1):
        if not line.strip():
            continue
        try:
            payload = json.loads(line)
        except json.JSONDecodeError as exc:
            raise InputError(
                "rollout state JSONL row is invalid",
                details={"path": str(path), "line": line_no, "column": exc.colno},
            ) from exc
        rows.append(_parse_rollout_row(payload, line_no=line_no))
    if not rows:
        raise InputError("rollout state JSONL must contain at least one measured row")
    duplicates = _duplicates(row.row_id for row in rows)
    if duplicates:
        raise InputError("rollout state row ids must be unique", details={"duplicates": duplicates})
    return tuple(rows)

build_rollout_metrics_payload

build_rollout_metrics_payload(rows: tuple[RolloutStateRow, ...], *, recall_k: int, model_id: str, model_release: str, dataset_snapshot: str, commit: str, hardware: str, artifacts: dict[str, str]) -> dict[str, object]

Build eval-compatible rollout-fidelity metrics from measured state rows.

Source code in geno_lewm/cli/rollout.py
def build_rollout_metrics_payload(
    rows: tuple[RolloutStateRow, ...],
    *,
    recall_k: int,
    model_id: str,
    model_release: str,
    dataset_snapshot: str,
    commit: str,
    hardware: str,
    artifacts: dict[str, str],
) -> dict[str, object]:
    """Assemble the eval-compatible rollout-fidelity payload for measured rows.

    Rows are summarised twice: once per split, which feeds the ``metrics``
    list, and once per (split, horizon) pair, which feeds the
    ``rollout_stratification`` list. The provenance arguments (``model_id``,
    ``model_release``, ``dataset_snapshot``, ``commit``, ``hardware``,
    ``artifacts``) are copied into the payload as given.

    Args:
        rows: Measured rollout-state rows to aggregate.
        recall_k: The k forwarded to the summary helpers and quoted in the
            Recall@k limitation text.
        model_id: Stored under ``"model_id"``.
        model_release: Stored under ``"model_release"``.
        dataset_snapshot: Stored under ``"dataset_snapshot"``.
        commit: Stored under ``"commit"``.
        hardware: Stored under ``"hardware"``.
        artifacts: Stored under ``"artifacts"`` (the same object, not a copy).

    Returns:
        A dictionary holding the schema version, provenance fields, metrics,
        stratification entries, limitations, negative findings and conclusions.
    """
    # One summary per split, ordered by split name.
    by_split = sorted(
        (
            _summarize_group(name, members, recall_k=recall_k)
            for name, members in _group_by_split(rows).items()
        ),
        key=lambda item: item.split,
    )

    # One summary per (split, horizon), ordered by split and then horizon;
    # a falsy horizon sorts as 0.
    by_split_horizon = sorted(
        (
            _summarize_group(name, members, recall_k=recall_k, horizon=steps)
            for (name, steps), members in _group_by_split_horizon(rows).items()
        ),
        key=lambda item: (item.split, item.horizon or 0),
    )

    # Only the split-level summaries contribute to the flat metrics list.
    metrics = []
    for item in by_split:
        metrics.extend(_metrics_for_summary(item, recall_k=recall_k))

    # Keys are inserted in the same order as the serialized schema expects.
    payload: dict[str, object] = {
        "schema_version": "1.0.0",
        "generated_by": EVAL_GENERATED_BY,
        "generated_at": _utc_now(),
        "model_id": model_id,
        "model_release": model_release,
        "dataset_snapshot": dataset_snapshot,
        "commit": commit,
        "hardware": hardware,
        "metrics": metrics,
        "artifacts": artifacts,
    }
    payload["rollout_stratification"] = [
        _stratification_payload(item, recall_k=recall_k) for item in by_split_horizon
    ]
    payload["limitations"] = [
        (
            "Rollout-fidelity metrics are computed from measured latent-state rows; "
            "this command does not generate held-out haplotypes or run Carbon encoding."
        ),
        (
            f"Recall@{recall_k} uses target-rank evidence supplied by the measured "
            "rollout-state artifact."
        ),
    ]
    payload["negative_findings"] = [
        (
            "No clinical utility, privacy, deployment, or runtime-assurance claim is "
            "established by rollout-fidelity metrics."
        )
    ]
    payload["conclusions"] = _metric_conclusions(metrics)
    return payload