Skip to content

geno_lewm.evaluation

evaluation

Artifact-level evaluation helpers for first-experiment score files.

VariantKey dataclass

VariantKey(chrom: str, pos: int, ref: str, alt: str)

Comparable SNV key shared by score and label artifacts.

to_dict

to_dict() -> dict[str, str | int]

Return the JSON-native variant-key payload.

Source code in geno_lewm/evaluation.py
def to_dict(self) -> dict[str, str | int]:
    """Return the JSON-native variant-key payload."""
    return {"chrom": self.chrom, "pos": self.pos, "ref": self.ref, "alt": self.alt}

BinaryEvalResult dataclass

BinaryEvalResult(split: str, score_field: str, threshold: float, labelled_variants: int, evaluated_variants: int, positive_variants: int, negative_variants: int, extra_score_variants: int, auroc: float, average_precision: float, accuracy: float, balanced_accuracy: float, sensitivity: float, specificity: float, ci_level: float, bootstrap_resamples: int, bootstrap_seed: int, auroc_ci_low: float | None = None, auroc_ci_high: float | None = None, average_precision_ci_low: float | None = None, average_precision_ci_high: float | None = None, accuracy_ci_low: float | None = None, accuracy_ci_high: float | None = None, balanced_accuracy_ci_low: float | None = None, balanced_accuracy_ci_high: float | None = None)

Measured binary evaluation over labelled score records.

evaluated_variant_keys_sha256 property

evaluated_variant_keys_sha256: str

SHA-256 identity of the sorted evaluated variant-key set, when known.

to_report_metrics

to_report_metrics() -> list[dict[str, object]]

Render metrics in tools.release.eval_report input shape.

Source code in geno_lewm/evaluation.py
def to_report_metrics(self) -> list[dict[str, object]]:
    """Render metrics in ``tools.release.eval_report`` input shape.

    One entry is emitted per metric, in the order ``auroc``,
    ``average_precision``, ``balanced_accuracy``, ``accuracy``. The two
    ranking metrics carry the label/score-field note and the two thresholded
    metrics carry the threshold note. Every note is then suffixed with the
    text returned by ``self._ci_note()``.

    Returns:
        A list of four metric dicts. Each holds the base fields followed by
        whatever ``_evaluated_variant_hash_payload`` and ``_ci_payload``
        contribute for that metric.
    """
    ranking_note = (
        "positive=P/LP and negative=B/LB ClinVar labels; "
        f"scores use {self.score_field}"
    )
    hash_fields = _evaluated_variant_hash_payload(self.evaluated_variant_keys_sha256)
    cutoff_note = f"threshold={self.threshold:g}"

    # One row per reported metric: (name, unit, value, note, ci_low, ci_high).
    rows = (
        ("auroc", "area", self.auroc, ranking_note, self.auroc_ci_low, self.auroc_ci_high),
        (
            "average_precision",
            "area",
            self.average_precision,
            ranking_note,
            self.average_precision_ci_low,
            self.average_precision_ci_high,
        ),
        (
            "balanced_accuracy",
            "fraction",
            self.balanced_accuracy,
            cutoff_note,
            self.balanced_accuracy_ci_low,
            self.balanced_accuracy_ci_high,
        ),
        (
            "accuracy",
            "fraction",
            self.accuracy,
            cutoff_note,
            self.accuracy_ci_low,
            self.accuracy_ci_high,
        ),
    )

    report = []
    for name, unit, value, note, ci_low, ci_high in rows:
        entry = {
            "name": name,
            "value": value,
            "split": self.split,
            "unit": unit,
            "higher_is_better": True,
            "n": self.evaluated_variants,
            "notes": note,
            # Hash fields first, CI fields last, so later keys win on a clash.
            **hash_fields,
            **_ci_payload(ci_low, ci_high),
        }
        report.append(entry)

    # The CI note is appended only after every entry has been assembled.
    ci_suffix = self._ci_note()
    for entry in report:
        entry["notes"] = f"{entry['notes']}; {ci_suffix}"
    return report

to_summary_dict

to_summary_dict() -> dict[str, object]

Return a compact JSON summary for CLI stdout.

Source code in geno_lewm/evaluation.py
def to_summary_dict(self) -> dict[str, object]:
    """Return a compact JSON summary for CLI stdout.

    Returns:
        A dict of the run parameters, variant counts and point metrics copied
        from this result, followed by one ``<metric>_ci`` entry per bounded
        metric as produced by ``_ci_summary``. A falsy
        ``evaluated_variant_keys_sha256`` is reported as ``None``.
    """
    leading_fields = (
        "split",
        "score_field",
        "threshold",
        "labelled_variants",
        "evaluated_variants",
    )
    trailing_fields = (
        "positive_variants",
        "negative_variants",
        "extra_score_variants",
        "auroc",
        "average_precision",
        "accuracy",
        "balanced_accuracy",
        "sensitivity",
        "specificity",
        "ci_level",
        "bootstrap_resamples",
        "bootstrap_seed",
    )
    bounded_metrics = ("auroc", "average_precision", "accuracy", "balanced_accuracy")

    summary: dict[str, object] = {name: getattr(self, name) for name in leading_fields}
    # The hash sits between the evaluated count and the class counts.
    summary["evaluated_variant_keys_sha256"] = self.evaluated_variant_keys_sha256 or None
    for name in trailing_fields:
        summary[name] = getattr(self, name)
    for metric_name in bounded_metrics:
        summary[f"{metric_name}_ci"] = _ci_summary(
            getattr(self, f"{metric_name}_ci_low"),
            getattr(self, f"{metric_name}_ci_high"),
        )
    return summary

ContinuousEvalResult dataclass

ContinuousEvalResult(split: str, score_field: str, label_field: str, labelled_variants: int, evaluated_variants: int, extra_score_variants: int, spearman_rho: float, ci_level: float, bootstrap_resamples: int, bootstrap_seed: int, spearman_rho_ci_low: float | None = None, spearman_rho_ci_high: float | None = None)

Measured continuous-label evaluation over matched score records.

evaluated_variant_keys_sha256 property

evaluated_variant_keys_sha256: str

SHA-256 identity of the sorted evaluated variant-key set, when known.

to_report_metrics

to_report_metrics() -> list[dict[str, object]]

Render Spearman correlation in tools.release.eval_report input shape.

Source code in geno_lewm/evaluation.py
def to_report_metrics(self) -> list[dict[str, object]]:
    """Render Spearman correlation in ``tools.release.eval_report`` input shape.

    Returns:
        A single-element list holding the ``spearman_rho`` metric dict. Its
        note names the label field and score field and ends with the text
        returned by ``self._ci_note()``.
    """
    note_segments = (
        f"continuous labels use {self.label_field}",
        f"scores use {self.score_field}",
        f"{self._ci_note()}",
    )
    entry: dict[str, object] = {
        "name": "spearman_rho",
        "value": self.spearman_rho,
        "split": self.split,
        "unit": "correlation",
        "higher_is_better": True,
        "n": self.evaluated_variants,
        "notes": "; ".join(note_segments),
        # Hash fields first, CI fields last, so later keys win on a clash.
        **_evaluated_variant_hash_payload(self.evaluated_variant_keys_sha256),
        **_ci_payload(self.spearman_rho_ci_low, self.spearman_rho_ci_high),
    }
    return [entry]

to_summary_dict

to_summary_dict() -> dict[str, object]

Return a compact JSON summary for CLI stdout.

Source code in geno_lewm/evaluation.py
def to_summary_dict(self) -> dict[str, object]:
    """Return a compact JSON summary for CLI stdout.

    Returns:
        A dict of the run parameters, variant counts and Spearman rho copied
        from this result, ending with a ``spearman_rho_ci`` entry produced by
        ``_ci_summary``. A falsy ``evaluated_variant_keys_sha256`` is reported
        as ``None``.
    """
    leading_fields = (
        "split",
        "score_field",
        "label_field",
        "labelled_variants",
        "evaluated_variants",
    )
    trailing_fields = (
        "extra_score_variants",
        "spearman_rho",
        "ci_level",
        "bootstrap_resamples",
        "bootstrap_seed",
    )

    summary: dict[str, object] = {name: getattr(self, name) for name in leading_fields}
    # The hash sits directly after the evaluated-variant count.
    summary["evaluated_variant_keys_sha256"] = self.evaluated_variant_keys_sha256 or None
    for name in trailing_fields:
        summary[name] = getattr(self, name)
    summary["spearman_rho_ci"] = _ci_summary(
        self.spearman_rho_ci_low,
        self.spearman_rho_ci_high,
    )
    return summary

evaluate_score_labels

evaluate_score_labels(scores_jsonl: str | Path, labels_jsonl: str | Path, *, score_field: str = DEFAULT_EVAL_SCORE_FIELD, threshold: float = DEFAULT_EVAL_THRESHOLD, split: str = 'eval_clinvar', bootstrap_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES, bootstrap_seed: int = DEFAULT_BOOTSTRAP_SEED, ci_level: float = DEFAULT_CI_LEVEL) -> BinaryEvalResult

Evaluate score JSONL against held-out ClinVar-style label JSONL.

Source code in geno_lewm/evaluation.py
def evaluate_score_labels(
    scores_jsonl: str | Path,
    labels_jsonl: str | Path,
    *,
    score_field: str = DEFAULT_EVAL_SCORE_FIELD,
    threshold: float = DEFAULT_EVAL_THRESHOLD,
    split: str = "eval_clinvar",
    bootstrap_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES,
    bootstrap_seed: int = DEFAULT_BOOTSTRAP_SEED,
    ci_level: float = DEFAULT_CI_LEVEL,
) -> BinaryEvalResult:
    """Evaluate score JSONL against held-out ClinVar-style label JSONL.

    Args:
        scores_jsonl: Path to the score records.
        labels_jsonl: Path to the label records.
        score_field: Name of the score field to read; must be a non-empty string.
        threshold: Decision threshold passed to the thresholded metrics.
        split: Split name recorded on the result (stored stripped).
        bootstrap_resamples: Resample count passed to the bootstrap helper.
        bootstrap_seed: Seed passed to the bootstrap helper.
        ci_level: Confidence level passed to the bootstrap helper.

    Returns:
        A ``BinaryEvalResult`` over every labelled variant, passed through
        ``_with_evaluated_variant_keys_sha256`` together with the hash of the
        sorted evaluated keys.

    Raises:
        InputError: If ``score_field`` or ``split`` is not a non-empty string,
            if any labelled variant has no score, or if the labels do not
            contain at least one positive and one negative. The ``_require_*``
            validators and loaders are defined elsewhere and may raise too.
    """
    if not (isinstance(score_field, str) and score_field):
        raise InputError("score_field must be a non-empty string")
    _require_finite_number("threshold", threshold)
    if not (isinstance(split, str) and split.strip()):
        raise InputError("split must be a non-empty string")
    n_resamples = _require_non_negative_int("bootstrap_resamples", bootstrap_resamples)
    rng_seed = _require_int("bootstrap_seed", bootstrap_seed)
    confidence = _require_ci_level(ci_level)

    label_by_key = _load_label_map(Path(labels_jsonl))
    score_by_key = _load_score_map(Path(scores_jsonl), score_field=score_field)

    # Every labelled variant must have a score; extra scores are only counted.
    unscored = sorted(set(label_by_key).difference(score_by_key))
    if unscored:
        preview = [key.to_dict() for key in unscored[:10]]
        raise InputError(
            "scores are missing labelled variants",
            details={"missing": preview, "count": len(unscored)},
            remediation="score every held-out labelled variant before generating metrics",
        )

    ordered_keys = sorted(label_by_key)
    truths = [label_by_key[key] for key in ordered_keys]
    predictions = [score_by_key[key] for key in ordered_keys]
    n_positive = sum(bool(flag) for flag in truths)
    n_negative = len(truths) - n_positive
    if n_positive == 0 or n_negative == 0:
        raise InputError(
            "label evaluation requires at least one positive and one negative variant",
            details={"positive": n_positive, "negative": n_negative},
        )

    point = _binary_metric_values(truths, predictions, threshold=threshold)
    intervals = _bootstrap_intervals(
        truths,
        predictions,
        threshold=threshold,
        resamples=n_resamples,
        seed=rng_seed,
        ci_level=confidence,
    )

    # Metrics absent from the interval mapping fall back to (None, None).
    ci_kwargs = {}
    for metric_name in ("auroc", "average_precision", "accuracy", "balanced_accuracy"):
        bounds = intervals.get(metric_name, (None, None))
        ci_kwargs[f"{metric_name}_ci_low"] = bounds[0]
        ci_kwargs[f"{metric_name}_ci_high"] = bounds[1]

    outcome = BinaryEvalResult(
        split=split.strip(),
        score_field=score_field,
        threshold=float(threshold),
        labelled_variants=len(label_by_key),
        evaluated_variants=len(truths),
        positive_variants=n_positive,
        negative_variants=n_negative,
        extra_score_variants=len(set(score_by_key).difference(label_by_key)),
        auroc=point.auroc,
        average_precision=point.average_precision,
        accuracy=point.accuracy,
        balanced_accuracy=point.balanced_accuracy,
        sensitivity=point.sensitivity,
        specificity=point.specificity,
        ci_level=confidence,
        bootstrap_resamples=n_resamples,
        bootstrap_seed=rng_seed,
        **ci_kwargs,
    )
    return _with_evaluated_variant_keys_sha256(outcome, _variant_keys_sha256(ordered_keys))

evaluate_continuous_score_labels

evaluate_continuous_score_labels(scores_jsonl: str | Path, labels_jsonl: str | Path, *, score_field: str = DEFAULT_EVAL_SCORE_FIELD, label_field: str = 'value', split: str = 'eval_continuous', bootstrap_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES, bootstrap_seed: int = DEFAULT_BOOTSTRAP_SEED, ci_level: float = DEFAULT_CI_LEVEL) -> ContinuousEvalResult

Evaluate score JSONL against held-out continuous labels using Spearman rho.

Source code in geno_lewm/evaluation.py
def evaluate_continuous_score_labels(
    scores_jsonl: str | Path,
    labels_jsonl: str | Path,
    *,
    score_field: str = DEFAULT_EVAL_SCORE_FIELD,
    label_field: str = "value",
    split: str = "eval_continuous",
    bootstrap_resamples: int = DEFAULT_BOOTSTRAP_RESAMPLES,
    bootstrap_seed: int = DEFAULT_BOOTSTRAP_SEED,
    ci_level: float = DEFAULT_CI_LEVEL,
) -> ContinuousEvalResult:
    """Evaluate score JSONL against held-out continuous labels using Spearman rho.

    Args:
        scores_jsonl: Path to the score records.
        labels_jsonl: Path to the continuous label records.
        score_field: Name of the score field to read; must be a non-empty string.
        label_field: Name of the label field to read; must be a non-empty string.
        split: Split name recorded on the result (stored stripped).
        bootstrap_resamples: Resample count passed to the bootstrap helper.
        bootstrap_seed: Seed passed to the bootstrap helper.
        ci_level: Confidence level passed to the bootstrap helper.

    Returns:
        A ``ContinuousEvalResult`` over every labelled variant, passed through
        ``_with_continuous_evaluated_variant_keys_sha256`` together with the
        hash of the sorted evaluated keys.

    Raises:
        InputError: If ``score_field``, ``label_field`` or ``split`` is not a
            non-empty string, if any labelled variant has no score, or if
            fewer than two variants are matched. The ``_require_*`` validators
            and loaders are defined elsewhere and may raise too.
    """
    if not (isinstance(score_field, str) and score_field):
        raise InputError("score_field must be a non-empty string")
    if not (isinstance(label_field, str) and label_field):
        raise InputError("label_field must be a non-empty string")
    if not (isinstance(split, str) and split.strip()):
        raise InputError("split must be a non-empty string")
    n_resamples = _require_non_negative_int("bootstrap_resamples", bootstrap_resamples)
    rng_seed = _require_int("bootstrap_seed", bootstrap_seed)
    confidence = _require_ci_level(ci_level)

    label_by_key = _load_continuous_label_map(Path(labels_jsonl), label_field=label_field)
    score_by_key = _load_score_map(Path(scores_jsonl), score_field=score_field)

    # Every labelled variant must have a score; extra scores are only counted.
    unscored = sorted(set(label_by_key).difference(score_by_key))
    if unscored:
        preview = [key.to_dict() for key in unscored[:10]]
        raise InputError(
            "scores are missing labelled variants",
            details={"missing": preview, "count": len(unscored)},
            remediation="score every held-out labelled variant before generating metrics",
        )

    ordered_keys = sorted(label_by_key)
    targets = [label_by_key[key] for key in ordered_keys]
    predictions = [score_by_key[key] for key in ordered_keys]
    if len(ordered_keys) < 2:
        raise InputError("continuous evaluation requires at least two matched variants")

    correlation = _spearman_rho(targets, predictions)
    intervals = _continuous_bootstrap_intervals(
        targets,
        predictions,
        resamples=n_resamples,
        seed=rng_seed,
        ci_level=confidence,
    )
    # A missing interval falls back to (None, None).
    rho_bounds = intervals.get("spearman_rho", (None, None))

    outcome = ContinuousEvalResult(
        split=split.strip(),
        score_field=score_field,
        label_field=label_field,
        labelled_variants=len(label_by_key),
        evaluated_variants=len(ordered_keys),
        extra_score_variants=len(set(score_by_key).difference(label_by_key)),
        spearman_rho=correlation,
        ci_level=confidence,
        bootstrap_resamples=n_resamples,
        bootstrap_seed=rng_seed,
        spearman_rho_ci_low=rho_bounds[0],
        spearman_rho_ci_high=rho_bounds[1],
    )
    return _with_continuous_evaluated_variant_keys_sha256(
        outcome,
        _variant_keys_sha256(ordered_keys),
    )

build_eval_report_payload

build_eval_report_payload(result: BinaryEvalResult, *, model_id: str, model_release: str, dataset_snapshot: str, commit: str, hardware: str, checkpoint: str | Path, config: str | Path, dataset_manifest: str | Path, eval_config: str | Path, efficiency_report: str | Path, scores: str | Path, labels: str | Path, baseline_result: BinaryEvalResult | None = None, baseline_name: str | None = None, baseline_scores: str | Path | None = None, generated_at: str | None = None) -> dict[str, object]

Build measured metrics JSON accepted by tools.release.eval_report.

Source code in geno_lewm/evaluation.py
def build_eval_report_payload(
    result: BinaryEvalResult,
    *,
    model_id: str,
    model_release: str,
    dataset_snapshot: str,
    commit: str,
    hardware: str,
    checkpoint: str | Path,
    config: str | Path,
    dataset_manifest: str | Path,
    eval_config: str | Path,
    efficiency_report: str | Path,
    scores: str | Path,
    labels: str | Path,
    baseline_result: BinaryEvalResult | None = None,
    baseline_name: str | None = None,
    baseline_scores: str | Path | None = None,
    generated_at: str | None = None,
) -> dict[str, object]:
    """Build measured metrics JSON accepted by ``tools.release.eval_report``.

    Args:
        result: Binary evaluation result to report.
        model_id, model_release, dataset_snapshot, commit, hardware:
            Provenance strings, each passed through ``_required_text``.
        checkpoint, config, dataset_manifest, eval_config, efficiency_report,
        scores, labels: Artifact locations, stored as ``str(...)``.
        baseline_result, baseline_name, baseline_scores: Optional baseline
            inputs, checked together by ``_require_baseline_inputs``.
        generated_at: Timestamp to record. When falsy, the current UTC time
            in ISO format with a ``Z`` suffix is used.

    Returns:
        The report payload dict with schema version, provenance, metrics,
        artifacts, limitations, negative findings and conclusions.
    """
    _require_baseline_inputs(
        baseline_result=baseline_result,
        baseline_name=baseline_name,
        baseline_scores=baseline_scores,
    )

    timestamp = generated_at
    if not timestamp:
        timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    payload: dict[str, object] = {
        "schema_version": "1.0.0",
        "generated_by": "geno-lewm-eval",
        "generated_at": timestamp,
    }

    # Provenance fields are validated in this fixed order.
    provenance = (
        ("model_id", model_id),
        ("model_release", model_release),
        ("dataset_snapshot", dataset_snapshot),
        ("commit", commit),
        ("hardware", hardware),
    )
    for field_name, raw_text in provenance:
        payload[field_name] = _required_text(field_name, raw_text)

    payload["metrics"] = _report_metrics(
        result,
        baseline_result=baseline_result,
        baseline_name=baseline_name,
    )

    artifacts = {
        "checkpoint": str(checkpoint),
        "config": str(config),
        "dataset_manifest": str(dataset_manifest),
        "eval_config": str(eval_config),
        "efficiency_report": str(efficiency_report),
        "scores": str(scores),
        "labels": str(labels),
    }
    # The baseline score artifact is listed only when one was supplied.
    if baseline_scores is not None:
        artifacts["baseline_scores"] = str(baseline_scores)
    payload["artifacts"] = artifacts

    payload["limitations"] = [
        (
            "Artifact-level binary ClinVar evaluation only; labels outside P/LP/B/LB "
            "are excluded from the measured set."
        ),
        (
            "Confidence intervals use deterministic stratified bootstrap resampling; "
            "if bootstrap_resamples is zero, metric notes state that intervals were omitted."
        ),
        "The metrics do not establish clinical utility or deployment readiness.",
    ]
    payload["negative_findings"] = [
        "This report does not measure non-coding, multi-edit, or prospective clinical utility.",
        "Failures and omitted intervals must be read from the metric notes and limitations.",
    ]

    coverage_sentence = (
        f"The score artifact was evaluated on {result.evaluated_variants} labelled variants "
        f"({result.positive_variants} positive, {result.negative_variants} negative) "
        f"from {result.split}."
    )
    payload["conclusions"] = [
        coverage_sentence,
        _summary_conclusion(
            result,
            baseline_result=baseline_result,
            baseline_name=baseline_name,
        ),
    ]
    return payload

build_continuous_eval_report_payload

build_continuous_eval_report_payload(result: ContinuousEvalResult, *, model_id: str, model_release: str, dataset_snapshot: str, commit: str, hardware: str, checkpoint: str | Path, config: str | Path, dataset_manifest: str | Path, eval_config: str | Path, efficiency_report: str | Path, scores: str | Path, labels: str | Path, baseline_result: ContinuousEvalResult | None = None, baseline_name: str | None = None, baseline_scores: str | Path | None = None, generated_at: str | None = None) -> dict[str, object]

Build measured continuous metrics JSON accepted by eval_report.

Source code in geno_lewm/evaluation.py
def build_continuous_eval_report_payload(
    result: ContinuousEvalResult,
    *,
    model_id: str,
    model_release: str,
    dataset_snapshot: str,
    commit: str,
    hardware: str,
    checkpoint: str | Path,
    config: str | Path,
    dataset_manifest: str | Path,
    eval_config: str | Path,
    efficiency_report: str | Path,
    scores: str | Path,
    labels: str | Path,
    baseline_result: ContinuousEvalResult | None = None,
    baseline_name: str | None = None,
    baseline_scores: str | Path | None = None,
    generated_at: str | None = None,
) -> dict[str, object]:
    """Build measured continuous metrics JSON accepted by ``eval_report``.

    Args:
        result: Continuous-label evaluation result to report.
        model_id, model_release, dataset_snapshot, commit, hardware:
            Provenance strings, each passed through ``_required_text``.
        checkpoint, config, dataset_manifest, eval_config, efficiency_report,
        scores, labels: Artifact locations, stored as ``str(...)``.
        baseline_result, baseline_name, baseline_scores: Optional baseline
            inputs, checked together by ``_require_baseline_inputs``.
        generated_at: Timestamp to record. When falsy, the current UTC time
            in ISO format with a ``Z`` suffix is used.

    Returns:
        The report payload dict with schema version, provenance, metrics,
        artifacts, limitations, negative findings and conclusions.
    """
    _require_baseline_inputs(
        baseline_result=baseline_result,
        baseline_name=baseline_name,
        baseline_scores=baseline_scores,
    )

    timestamp = generated_at
    if not timestamp:
        timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    payload: dict[str, object] = {
        "schema_version": "1.0.0",
        "generated_by": "geno-lewm-eval",
        "generated_at": timestamp,
    }

    # Provenance fields are validated in this fixed order.
    provenance = (
        ("model_id", model_id),
        ("model_release", model_release),
        ("dataset_snapshot", dataset_snapshot),
        ("commit", commit),
        ("hardware", hardware),
    )
    for field_name, raw_text in provenance:
        payload[field_name] = _required_text(field_name, raw_text)

    payload["metrics"] = _continuous_report_metrics(
        result,
        baseline_result=baseline_result,
        baseline_name=baseline_name,
    )

    artifacts = {
        "checkpoint": str(checkpoint),
        "config": str(config),
        "dataset_manifest": str(dataset_manifest),
        "eval_config": str(eval_config),
        "efficiency_report": str(efficiency_report),
        "scores": str(scores),
        "labels": str(labels),
    }
    # The baseline score artifact is listed only when one was supplied.
    if baseline_scores is not None:
        artifacts["baseline_scores"] = str(baseline_scores)
    payload["artifacts"] = artifacts

    payload["limitations"] = [
        "Artifact-level continuous-label evaluation only; labels are not clinical outcomes.",
        (
            "Confidence intervals use deterministic bootstrap resampling; if "
            "bootstrap_resamples is zero, metric notes state that intervals were omitted."
        ),
        "The metrics do not establish clinical utility or deployment readiness.",
    ]
    payload["negative_findings"] = [
        "This report does not measure prospective clinical utility.",
        "Correlation metrics do not establish calibration or causal edit effects.",
    ]

    coverage_sentence = (
        f"The score artifact was evaluated on {result.evaluated_variants} "
        f"continuous-label variants from {result.split}."
    )
    payload["conclusions"] = [
        coverage_sentence,
        _continuous_summary_conclusion(
            result,
            baseline_result=baseline_result,
            baseline_name=baseline_name,
        ),
    ]
    return payload