Skip to content

geno_lewm

geno_lewm

GenoLeWM — action-conditioned JEPA world model for DNA.

GenoLeWM treats genetic edits as first-class actions and learns latent transitions on top of a frozen DNA foundation model. The package today ships the production infrastructure layer (typed errors, structured observability with privacy redaction, content-addressed provenance receipts, canonical edit specs, the verify CLI). The training, predictor, and deployment surfaces land incrementally — see ROADMAP.md and the rfcs/ corpus.

The public Python surface is enumerated below in __all__. Every symbol's stability is governed by RFC-0014 (api stability) and the tests/api/public_surface.json snapshot is the binding contract.

ActionEncoder

ActionEncoder(*, d_action: int = 512, d_pos: int = 128, d_type: int = 64, d_seq: int = 256, max_window_bp: int = 12288, carbon_tokenizer: Any | None = None)

Bases: Module

Encode `RelEdit` objects into learned action embeddings.

Source code in geno_lewm/action/encoder.py
def __init__(
    self,
    *,
    d_action: int = 512,
    d_pos: int = 128,
    d_type: int = 64,
    d_seq: int = 256,
    max_window_bp: int = 12_288,
    carbon_tokenizer: Any | None = None,
) -> None:
    """Build the embedding tables and projection head of the action encoder.

    Args:
        d_action: Width of the output action embedding.
        d_pos: Width of the position features; must be even because the
            position embedding is sinusoidal.
        d_type: Width of the learned edit-type embedding.
        d_seq: Width handed to ``SeqMicroEncoder``; the projection input
            reserves ``2 * d_seq`` features for its output.
        max_window_bp: Stored on the instance as ``max_window_bp``.
        carbon_tokenizer: Optional tokenizer object, stored as-is.

    Raises:
        InputError: If ``d_pos`` is odd. Each dimension is additionally
            checked by ``_require_positive`` before any layer is built.
    """
    super().__init__()
    _require_positive("d_action", d_action)
    _require_positive("d_pos", d_pos)
    _require_positive("d_type", d_type)
    # d_seq sizes both SeqMicroEncoder and the projection input, so it gets
    # the same up-front check as every other dimension.
    _require_positive("d_seq", d_seq)
    _require_positive("max_window_bp", max_window_bp)
    if d_pos % 2 != 0:
        raise InputError("d_pos must be even for sinusoidal position embeddings")
    self._d_action = d_action
    self.d_pos = d_pos
    self.max_window_bp = max_window_bp
    self.carbon_tokenizer = carbon_tokenizer
    # One embedding row per EditType member.
    self.type_embedding = nn.Embedding(len(EditType), d_type)
    self.seq_encoder = SeqMicroEncoder(d_seq=d_seq)
    # Two d_seq-wide slots — presumably the ref and alt encodings; confirm
    # against forward().
    projection_in = d_pos + d_type + (2 * d_seq)
    self.projection = nn.Sequential(
        nn.Linear(projection_in, 1024),
        nn.GELU(),
        nn.LayerNorm(1024),
        nn.Linear(1024, d_action),
    )
    # Learned vector initialised to zeros, one value per action dimension.
    self.padding_embedding = nn.Parameter(torch.zeros(d_action))

EditSpec dataclass

EditSpec(chrom: str, pos: int, ref: str, alt: str, edit_type: EditType = EditType.SNV)

A canonical, frozen genomic edit (RFC-0003 §3.1).

Construct with absolute VCF-style coordinates; the derived `edit_type` attribute is filled in by `__post_init__`.

pos is 1-based per VCF convention; both ref and alt are explicit base strings (no <DEL> / <INS> symbolic alleles — they're deferred to v2).

relative_to

relative_to(window_start_bp: int, window_end_bp: int) -> RelEdit

Return the window-relative form (RFC-0003 §3.3).

`window_start_bp` and `window_end_bp` are 0-based inclusive coordinates on the same chromosome as `chrom`. The predictor sees only the relative offset; absolute coordinates never enter the model.

Source code in geno_lewm/action/spec.py
def relative_to(self, window_start_bp: int, window_end_bp: int) -> RelEdit:
    """Project this edit into window-relative coordinates (RFC-0003 §3.3).

    Both window bounds are 0-based and inclusive, on the same chromosome
    as :attr:`chrom`. Only the relative offset is handed onward, so the
    predictor never sees absolute coordinates.

    Raises:
        InvalidEditError: If the window end precedes the window start.
        OutOfWindowError: If the edit starts before the window or its
            reference allele extends past the window end.
    """
    if window_start_bp > window_end_bp:
        raise InvalidEditError(
            "window_end_bp must be >= window_start_bp",
            details={"start": window_start_bp, "end": window_end_bp},
        )

    # Inclusive bounds, hence the +1.
    window_len = window_end_bp - window_start_bp + 1
    # ``pos`` is 1-based (VCF); subtract one to get a 0-based offset.
    rel_pos = self.pos - 1 - window_start_bp
    ref_len = len(self.ref)

    starts_before_window = rel_pos < 0
    runs_past_window = rel_pos + ref_len > window_len
    if starts_before_window or runs_past_window:
        raise OutOfWindowError(
            "edit falls outside the window",
            details={
                "pos": self.pos,
                "ref_len": ref_len,
                "window_start_bp": window_start_bp,
                "window_end_bp": window_end_bp,
                "rel_pos": rel_pos,
            },
            remediation="re-center the encoder window over the edit, or skip the edit",
        )

    return RelEdit(
        rel_pos=rel_pos,
        edit_type=self.edit_type,
        ref_bases=self.ref,
        alt_bases=self.alt,
    )

EditType

Bases: IntEnum

The six v1 edit categories (RFC-0003 §3.2).

Members are deterministic functions of (len(ref), len(alt)) — callers do not pass this value; it is computed during construction.

RelEdit dataclass

RelEdit(rel_pos: int, edit_type: EditType, ref_bases: str, alt_bases: str)

Window-relative form consumed by the action encoder.

BackendProbe dataclass

BackendProbe(backend: str, available: bool, reason: str)

Capability probe result for one runtime backend.

GenoLeWMRuntime

GenoLeWMRuntime(model_dir: str | Path, backend: str = BACKEND_AUTO, *, encoder: object | None = None, action_encoder: object | None = None, predictor: object | None = None, calibration: CalibrationTable | None = None)

Top-level runtime facade for on-device inference workflows.

Source code in geno_lewm/deploy/runtime.py
def __init__(
    self,
    model_dir: str | Path,
    backend: str = BACKEND_AUTO,
    *,
    encoder: object | None = None,
    action_encoder: object | None = None,
    predictor: object | None = None,
    calibration: CalibrationTable | None = None,
) -> None:
    """Open a model directory and wire up the runtime.

    Args:
        model_dir: Directory holding the model artifacts; ``~`` is expanded.
        backend: Requested backend name, resolved against the probe results.
        encoder: Optional encoder forwarded to scorer resolution.
        action_encoder: Optional action encoder forwarded likewise.
        predictor: Optional predictor forwarded likewise.
        calibration: Optional calibration table forwarded likewise.

    Raises:
        ModelNotFoundError: If ``model_dir`` is not an existing directory.
    """
    model_root = Path(model_dir).expanduser()
    if not (model_root.exists() and model_root.is_dir()):
        raise ModelNotFoundError(
            "model_dir must be an existing directory",
            details={"model_dir": str(model_root)},
        )
    self.model_dir = model_root

    # Artifacts are only verified when a manifest was actually loaded.
    manifest = _load_runtime_manifest(model_root)
    self.manifest = manifest
    if manifest is not None:
        _verify_manifest_artifacts(model_root, manifest)

    # Probe first, then pick the backend from what the probes reported.
    probes = probe_backends(model_root)
    self.probes = probes
    self.backend = select_backend(backend, probes=probes)

    self._scorer = _resolve_scorer_components(
        model_root,
        manifest,
        encoder=encoder,
        action_encoder=action_encoder,
        predictor=predictor,
        calibration=calibration,
    )

score_variant

score_variant(variant: EditSpec, window: str | None = None, *, receipt_path: str | Path | None = None) -> Any

Score a single variant through local scorer components when available.

Source code in geno_lewm/deploy/runtime.py
def score_variant(
    self,
    variant: EditSpec,
    window: str | None = None,
    *,
    receipt_path: str | Path | None = None,
) -> Any:
    """Score one variant with the locally resolved scorer components.

    Args:
        variant: The edit to score; must be an ``EditSpec``.
        window: Reference DNA window. It is canonicalized when given and
            is required whenever scorer components are available.
        receipt_path: When set, a receipt for the result is written there.

    Returns:
        Whatever ``score_surprise_variant`` returns.

    Raises:
        InputError: If ``variant`` is not an ``EditSpec`` or a scorer is
            available but no window was supplied.
    """
    if not isinstance(variant, EditSpec):
        raise InputError(
            "variant must be an EditSpec",
            details={"type": type(variant).__name__},
        )
    normalized_window = canonicalize_dna(window) if window is not None else None
    scorer = self._scorer
    with fail_closed_network_guard(), torch_inference_context():
        if scorer is None:
            # No local scorer components: defer to the shared not-ready path.
            _raise_backend_not_ready("score_variant", self.backend, self.model_dir)
        else:
            if normalized_window is None:
                raise InputError(
                    "score_variant requires a reference window",
                    remediation="pass window=... or use score_vcf with a local FASTA",
                )
            result = score_surprise_variant(
                variant,
                scorer.encoder,
                scorer.action_encoder,
                scorer.predictor,
                scorer.calibration,
                reference_window=normalized_window,
            )
            if receipt_path is not None:
                _write_score_variant_receipt(
                    backend=self.backend,
                    model_dir=self.model_dir,
                    manifest=self.manifest,
                    variant=variant,
                    reference_window=normalized_window,
                    result=result,
                    receipt_path=receipt_path,
                )
            return result

score_vcf

score_vcf(vcf_path: str | Path, fasta_path: str | Path, output_path: str | Path, batch_size: int = 64, progress: bool = True, *, receipt_path: str | Path | None = None) -> None

Score a VCF through local scorer components when available.

When receipt_path is provided, the runtime writes JSONL with one canonical v1 receipt per scored alternate. The v1 schema commits a single output, so this is intentionally not a batch aggregate receipt.

Source code in geno_lewm/deploy/runtime.py
def score_vcf(
    self,
    vcf_path: str | Path,
    fasta_path: str | Path,
    output_path: str | Path,
    batch_size: int = 64,
    progress: bool = True,
    *,
    receipt_path: str | Path | None = None,
) -> None:
    """Score a VCF through local scorer components when available.

    When ``receipt_path`` is provided, the runtime writes JSONL with
    one canonical v1 receipt per scored alternate. The v1 schema
    commits a single output, so this is intentionally not a batch
    aggregate receipt.

    Args:
        vcf_path: Input VCF.
        fasta_path: Local reference FASTA.
        output_path: Destination for the scores.
        batch_size: Positive integer batch size (``bool`` is rejected).
        progress: Whether to show progress; must be a ``bool``.
        receipt_path: Optional JSONL receipt destination. It must not
            refer to the same file as ``output_path``.

    Raises:
        InputError: If ``batch_size`` or ``progress`` is invalid, or the
            receipt and output paths point at the same file.
    """
    if not isinstance(batch_size, int) or isinstance(batch_size, bool) or batch_size <= 0:
        raise InputError(
            "batch_size must be a positive integer",
            details={"batch_size": batch_size, "type": type(batch_size).__name__},
        )
    if not isinstance(progress, bool):
        raise InputError(
            "progress must be bool",
            details={"type": type(progress).__name__},
        )
    # Normalize path-like values now so type errors surface at the API boundary.
    Path(vcf_path)
    Path(fasta_path)
    normalized_output = Path(output_path)
    normalized_receipt = None if receipt_path is None else Path(receipt_path)
    if normalized_receipt is not None:
        same_target = normalized_receipt == normalized_output
        if not same_target:
            # A lexical comparison misses aliases of one file (relative vs
            # absolute spelling, ".." segments, symlinks), which would let
            # receipts overwrite the score output.
            try:
                same_target = normalized_receipt.resolve() == normalized_output.resolve()
            except (OSError, RuntimeError):
                # Unresolvable path (e.g. a symlink loop): keep the lexical verdict.
                same_target = False
        if same_target:
            raise InputError("--receipt must differ from --output for VCF scoring")
    scorer = self._scorer
    with fail_closed_network_guard(), torch_inference_context():
        if scorer is not None:
            if normalized_receipt is None:
                score_surprise_vcf(
                    vcf_path,
                    scorer.encoder,
                    scorer.action_encoder,
                    scorer.predictor,
                    scorer.calibration,
                    normalized_output,
                    reference_fasta=fasta_path,
                    batch_size=batch_size,
                    show_progress=progress,
                )
            else:
                _write_vcf_scores_and_receipts(
                    backend=self.backend,
                    model_dir=self.model_dir,
                    manifest=self.manifest,
                    scorer=scorer,
                    vcf_path=vcf_path,
                    fasta_path=fasta_path,
                    output_path=normalized_output,
                    receipt_path=normalized_receipt,
                    batch_size=batch_size,
                )
            return
        # No local scorer components: defer to the shared not-ready path.
        _raise_backend_not_ready("score_vcf", self.backend, self.model_dir)

encode_window

encode_window(window: str, edit_locus: int | None = None) -> Any

Encode a DNA window once the encoder backend is installed.

Source code in geno_lewm/deploy/runtime.py
def encode_window(self, window: str, edit_locus: int | None = None) -> Any:
    """Validate the arguments, then report that no encoder backend is ready.

    Args:
        window: DNA window; passed through ``canonicalize_dna`` for
            validation only (the canonical form is not kept).
        edit_locus: Optional non-negative integer locus; ``bool`` values
            are rejected.

    Raises:
        InputError: If ``edit_locus`` is not ``None`` and is not a
            non-negative integer.
    """
    canonicalize_dna(window)
    if edit_locus is not None:
        is_plain_int = isinstance(edit_locus, int) and not isinstance(edit_locus, bool)
        if not is_plain_int or edit_locus < 0:
            raise InputError(
                "edit_locus must be a non-negative integer or None",
                details={"edit_locus": edit_locus, "type": type(edit_locus).__name__},
            )
    with fail_closed_network_guard():
        _raise_backend_not_ready("encode_window", self.backend, self.model_dir)

predict

predict(state: Any, edits: Sequence[RelEdit]) -> Any

Run the predictor once a predictor backend is installed.

Source code in geno_lewm/deploy/runtime.py
def predict(self, state: Any, edits: Sequence[RelEdit]) -> Any:
    """Run the predictor once a predictor backend is installed.

    Arguments are validated first; the call then goes to the shared
    backend-not-ready path.

    Args:
        state: Latent state to advance; must not be ``None``.
        edits: Sequence of ``RelEdit`` values. ``str`` and ``bytes`` are
            rejected even though they are sequences, matching
            ``encode_batch``.

    Raises:
        InputError: If ``state`` is ``None``, ``edits`` is not a proper
            sequence, or any element is not a ``RelEdit``.
    """
    if state is None:
        raise InputError("state must be non-None")
    # str/bytes satisfy Sequence: an empty one would otherwise pass
    # validation, and a non-empty one would get a misleading per-element error.
    if not isinstance(edits, Sequence) or isinstance(edits, str | bytes):
        raise InputError(
            "edits must be a sequence of RelEdit values",
            details={"type": type(edits).__name__},
        )
    for idx, edit in enumerate(edits):
        if not isinstance(edit, RelEdit):
            raise InputError(
                "edits must contain RelEdit values",
                details={"index": idx, "type": type(edit).__name__},
            )
    with fail_closed_network_guard():
        _raise_backend_not_ready("predict", self.backend, self.model_dir)

CarbonStateEncoder

CarbonStateEncoder(model_id: str, revision: str, *, dtype: str = 'bf16', state_layer: int = -1, pool_type: str = POOL_CENTERED_MEAN, pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS, normalize: bool = True, lora_config: object | None = None, model: object | None = None, tokenizer: object | None = None, encoder_hash: bytes | str | None = None, local_files_only: bool = True, trust_remote_code: bool = False, device: str | None = None)

Encode DNA windows with Carbon hidden states plus deterministic pooling.

Source code in geno_lewm/encoder/carbon.py
def __init__(
    self,
    model_id: str,
    revision: str,
    *,
    dtype: str = "bf16",
    state_layer: int = -1,
    pool_type: str = POOL_CENTERED_MEAN,
    pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS,
    normalize: bool = True,
    lora_config: object | None = None,
    model: object | None = None,
    tokenizer: object | None = None,
    encoder_hash: bytes | str | None = None,
    local_files_only: bool = True,
    trust_remote_code: bool = False,
    device: str | None = None,
) -> None:
    """Validate the configuration, then load or adopt the model and tokenizer.

    Args:
        model_id: Non-empty model identifier.
        revision: Non-empty model revision.
        dtype: Must be one of ``_SUPPORTED_DTYPES``.
        state_layer: Integer hidden-state layer index (``bool`` rejected).
        pool_type: ``POOL_CENTERED_MEAN`` or ``POOL_GLOBAL_MEAN``.
        pool_radius: Non-negative integer (``bool`` rejected).
        normalize: Must be a ``bool``; stored on the instance.
        lora_config: Must be ``None``; adapters are not supported yet.
        model: Pre-built model; must be supplied together with ``tokenizer``.
        tokenizer: Pre-built tokenizer; must be supplied together with ``model``.
        encoder_hash: Passed through ``_coerce_encoder_hash`` and stored.
        local_files_only: Forwarded to the loader when components are loaded.
        trust_remote_code: Forwarded to the loader when components are loaded.
        device: Passed through ``_resolve_device``; the result is stored.

    Raises:
        InputError: If any argument fails the checks above.
        RuntimeSetupError: If ``lora_config`` is not ``None``.
    """
    # --- Argument validation; the first failing check decides the error. ---
    if not model_id:
        raise InputError("model_id must be non-empty")
    if not revision:
        raise InputError("revision must be non-empty")
    if dtype not in _SUPPORTED_DTYPES:
        raise InputError(
            "unsupported encoder dtype",
            details={"dtype": dtype, "supported": sorted(_SUPPORTED_DTYPES)},
        )
    # bool is a subclass of int, so it is excluded explicitly.
    if not isinstance(state_layer, int) or isinstance(state_layer, bool):
        raise InputError(
            "state_layer must be an integer",
            details={"state_layer": state_layer, "type": type(state_layer).__name__},
        )
    if pool_type not in {POOL_CENTERED_MEAN, POOL_GLOBAL_MEAN}:
        raise InputError(
            "unsupported pool_type",
            details={
                "pool_type": pool_type,
                "supported": [POOL_CENTERED_MEAN, POOL_GLOBAL_MEAN],
            },
        )
    if not isinstance(pool_radius, int) or isinstance(pool_radius, bool) or pool_radius < 0:
        raise InputError(
            "pool_radius must be a non-negative integer",
            details={"pool_radius": pool_radius, "type": type(pool_radius).__name__},
        )
    if not isinstance(normalize, bool):
        raise InputError(
            "normalize must be bool",
            details={"type": type(normalize).__name__},
        )
    if lora_config is not None:
        raise RuntimeSetupError(
            "Carbon LoRA adapters are not supported by CarbonStateEncoder yet",
            remediation="merge LoRA adapters before loading or track the Phase 2 adapter issue",
        )
    # Exactly one of the two supplied is an error; both or neither is fine.
    if (model is None) != (tokenizer is None):
        raise InputError(
            "model and tokenizer must be supplied together",
            details={"model": model is not None, "tokenizer": tokenizer is not None},
        )

    # --- Store the validated configuration. ---
    self.model_id = model_id
    self.revision = revision
    self.dtype = dtype
    self.state_layer = state_layer
    self.pool_type = cast(_PoolType, pool_type)
    self.pool_radius = pool_radius
    self.normalize = normalize
    self.local_files_only = local_files_only
    self.trust_remote_code = trust_remote_code
    self.device = _resolve_device(device)
    self._encoder_hash = _coerce_encoder_hash(encoder_hash)
    # Stays None until a hidden size is read from the config below.
    self._d_state: int | None = None

    # After the pairing check above, this is true only when both are None.
    if model is None or tokenizer is None:
        tokenizer, model = _load_transformers_components(
            model_id=model_id,
            revision=revision,
            dtype=dtype,
            local_files_only=local_files_only,
            trust_remote_code=trust_remote_code,
        )
    self.tokenizer = tokenizer
    self.model = model
    # NOTE(review): presumably switches the model to eval mode when it
    # exposes one — confirm against the helper.
    _eval_if_available(self.model)
    _move_module_to_device(self.model, self.device)
    # Seed the state width from the model config when it reports a usable
    # positive integer hidden_size; otherwise leave it as None.
    config = getattr(self.model, "config", None)
    hidden_size = getattr(config, "hidden_size", None)
    if isinstance(hidden_size, int) and not isinstance(hidden_size, bool) and hidden_size > 0:
        self._d_state = hidden_size

encoder_hash property

encoder_hash: bytes

Return the configured encoder hash bytes.

d_state property

d_state: int

Return the pooled state width when known.

encode

encode(window: str, edit_locus: int | None = None) -> tuple[float, ...]

Encode and pool one DNA window.

Source code in geno_lewm/encoder/carbon.py
def encode(self, window: str, edit_locus: int | None = None) -> tuple[float, ...]:
    """Encode and pool one DNA window."""
    return self.encode_batch([window], [edit_locus])[0]

encode_batch

encode_batch(windows: Sequence[str], edit_loci: Sequence[int | None]) -> tuple[tuple[float, ...], ...]

Encode and pool a batch of DNA windows.

Source code in geno_lewm/encoder/carbon.py
def encode_batch(
    self,
    windows: Sequence[str],
    edit_loci: Sequence[int | None],
) -> tuple[tuple[float, ...], ...]:
    """Encode a batch of DNA windows and pool each one to a state vector.

    Args:
        windows: Non-empty sequence of DNA strings (a bare ``str`` or
            ``bytes`` is rejected).
        edit_loci: One locus (or ``None``) per window, same length as
            ``windows``.

    Returns:
        One pooled vector per window, in input order.

    Raises:
        InputError: If either argument is not a proper sequence, the
            lengths differ, ``windows`` is empty, or the encoder output
            does not contain one item per window.
    """
    text_like = (str, bytes)
    if isinstance(windows, text_like) or not isinstance(windows, Sequence):
        raise InputError(
            "windows must be a sequence of DNA strings",
            details={"type": type(windows).__name__},
        )
    if isinstance(edit_loci, text_like) or not isinstance(edit_loci, Sequence):
        raise InputError(
            "edit_loci must be a sequence of int or None values",
            details={"type": type(edit_loci).__name__},
        )
    n_windows = len(windows)
    n_loci = len(edit_loci)
    if n_windows != n_loci:
        raise InputError(
            "windows and edit_loci must have the same length",
            details={"windows": n_windows, "edit_loci": n_loci},
        )
    if not windows:
        raise InputError("windows must contain at least one sequence")

    # Canonicalize every window first, then wrap each for the tokenizer.
    canonical = tuple(map(canonicalize_dna, windows))
    tokenizer_inputs = list(map(wrap_dna_for_tokenizer, canonical))
    batch = _move_inputs_to_device(_tokenize(self.tokenizer, tokenizer_inputs), self.device)
    with torch_inference_context():
        model_output = _call_model(self.model, batch)
    rows_by_item = _hidden_rows_by_item(model_output, state_layer=self.state_layer)
    n_rows = len(rows_by_item)
    if n_rows != n_windows:
        raise InputError(
            "encoder output batch size does not match input windows",
            details={"expected": n_windows, "observed": n_rows},
        )

    pooled = [
        pool_hidden_states(
            item_rows,
            edit_locus=locus,
            pool_type=self.pool_type,
            pool_radius=self.pool_radius,
        ).vector
        for item_rows, locus in zip(rows_by_item, edit_loci, strict=True)
    ]
    encoded = tuple(pooled)
    if encoded:
        # Record the observed state width from the first pooled vector.
        self._d_state = len(encoded[0])
    return encoded

BackendUnsupportedError

BackendUnsupportedError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: DeployError

Requested runtime backend is not available on the host.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

CacheCorruptError

CacheCorruptError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ResourceError

A cache shard failed an integrity check.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

CollapseDetectedError

CollapseDetectedError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: TrainingError

A representation-collapse alert tripped (RFC-0005).

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

ConfigError

ConfigError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Configuration is malformed or incompatible.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

DataLoaderError

DataLoaderError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: TrainingError

Data pipeline raised an exception the trainer cannot recover from.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

DeployError

DeployError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Export or runtime backend failure.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

DiskFullError

DiskFullError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ResourceError

Storage was exhausted during a write.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

ErrorCodeEntry

ErrorCodeEntry(code: str, exception_class: type[GenoLeWMError], summary: str)

A single immutable row in the error-code registry.

Source code in geno_lewm/errors.py
def __init__(self, code: str, exception_class: type[GenoLeWMError], summary: str) -> None:
    """Store one registry row.

    Args:
        code: The error code string.
        exception_class: The ``GenoLeWMError`` subclass tied to ``code``.
        summary: Short description of the error.
    """
    # Assigned left to right: code, then exception_class, then summary.
    self.code, self.exception_class, self.summary = code, exception_class, summary

EvalDatasetError

EvalDatasetError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: EvalError

A benchmark file or dataset could not be loaded.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

EvalError

EvalError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Evaluation harness failure.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

EvalRegressionError

EvalRegressionError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: EvalError

A smoke-eval gate threshold was breached.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

ExportFormatError

ExportFormatError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: DeployError

Conversion to ONNX, Core ML, or GGUF failed.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

GenoLeWMError

GenoLeWMError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: Exception

Root of the GenoLeWM exception hierarchy.

Every typed exception in this package inherits from `GenoLeWMError`. Subclasses must set a `code` class attribute that matches an entry in `ERROR_CODES`.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

to_dict

to_dict() -> dict[str, Any]

Return the structured payload as a plain dict.

The output is JSON-serializable provided details values are JSON-native. Keys mirror the structured-log error event so the CLI dispatcher and observability layer can pass it through without translation.

Source code in geno_lewm/errors.py
def to_dict(self) -> dict[str, Any]:
    """Export the error as a plain dictionary.

    The keys are ``code``, ``message``, ``details`` and ``remediation``,
    in that order, mirroring the structured-log ``error`` event so the
    CLI dispatcher and observability layer can forward the payload
    untranslated. The result is JSON-serializable as long as the
    ``details`` values are JSON-native. ``details`` is the error's own
    dict, not a copy.
    """
    payload_fields = ("code", "message", "details", "remediation")
    return {field: getattr(self, field) for field in payload_fields}

to_json

to_json() -> str

Return the structured payload as a JSON string.

Adds a UTC ts field so log sinks receive a self-contained record.

Source code in geno_lewm/errors.py
def to_json(self) -> str:
    """Serialize the structured payload to a JSON string.

    The payload from ``to_dict`` is extended with a ``ts`` key holding
    the current UTC time in ISO-8601 form, so log sinks receive a
    self-contained record. Keys are emitted in sorted order, and values
    JSON cannot encode natively are converted with ``str``.
    """
    record = {
        **self.to_dict(),
        "ts": datetime.now(tz=timezone.utc).isoformat(),
    }
    return json.dumps(record, sort_keys=True, default=str)

InputCommitmentMismatchError

InputCommitmentMismatchError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ProvenanceError

Recomputed input commitment does not match the receipt.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

InputError

InputError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Caller-supplied input violates a documented invariant.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

InternalError

InternalError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

A bug we caught; should never surface to end users.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

InvalidEditError

InvalidEditError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InputError

An EditSpec fails one of its constructor invariants.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

InvariantViolation

InvariantViolation(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InternalError

A runtime invariant marked INV-* was breached.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

ManifestHashMismatchError

ManifestHashMismatchError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ProvenanceError

Recomputed manifest hash does not match the stated model_id.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

MissingConfigError

MissingConfigError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ConfigError

A required configuration field is absent.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

ModelNotFoundError

ModelNotFoundError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ResourceError

A model checkpoint is missing locally and cannot be downloaded.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Create the error with a message and optional structured context.

    Args:
        message: Human-readable description; also passed to the base
            initializer.
        details: Optional mapping of extra context. It is copied into a
            new ``dict``; ``None`` or an empty mapping yields ``{}``.
        remediation: Optional hint on how to resolve the problem.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the error owns a dict independent of the caller's mapping.
    self.details: dict[str, Any] = dict(details or {})

NaNLossError

NaNLossError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: TrainingError

Loss became NaN or Inf.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

NetworkCallProhibitedError

NetworkCallProhibitedError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ResourceError

A post-setup network call was attempted under fail-closed policy.

See RFC-0010 §3.7 ("on-device fail-closed").

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

OutOfMemoryError

OutOfMemoryError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ResourceError

Re-raise of CUDA / host OOM with attached GenoLeWM context.

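The name is scoped to this module's namespace. Note that Python's builtins module defines MemoryError, not OutOfMemoryError, so builtins.OutOfMemoryError does not exist; downstream code that needs the interpreter's own out-of-memory exception should catch builtins.MemoryError.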

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

OutOfWindowError

OutOfWindowError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InputError

An edit's rel_pos falls outside the encoder window.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

OutputCommitmentMismatchError

OutputCommitmentMismatchError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ProvenanceError

Recomputed output bytes do not match the receipt.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

OverlappingEditsError

OverlappingEditsError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InputError

Two or more edits in a haplotype overlap in genomic coordinates.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

ProvenanceError

ProvenanceError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Receipt or artifact-provenance failure (RFC-0011).

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

ProvenanceKindUnsupportedError

ProvenanceKindUnsupportedError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ProvenanceError

Verifier does not understand the receipt provenance kind.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

QuantizationError

QuantizationError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: DeployError

int8 or int4 calibration failed.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

ReceiptSchemaError

ReceiptSchemaError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ProvenanceError

Receipt JSON failed schema validation.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

ResourceError

ResourceError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Capacity, IO, or network failure.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

RuntimeSetupError

RuntimeSetupError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ResourceError

First-run network setup step (download / verification) failed.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

SchemaCompatError

SchemaCompatError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ConfigError

An on-disk artifact's schema MAJOR version is incompatible.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

TrainingError

TrainingError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: GenoLeWMError

Training-loop-specific failure.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

UnknownTopLevelKeyError

UnknownTopLevelKeyError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: ConfigError

A configuration payload contained a top-level key not in the schema.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

UnreachableError

UnreachableError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InternalError

Control flow reached a branch marked unreachable.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

UnsupportedEditError

UnsupportedEditError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InputError

An edit's type or length is outside the v1 scope (RFC-0003).

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

VcfParseError

VcfParseError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InputError

A VCF or FASTA input is malformed.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

WindowMismatchError

WindowMismatchError(message: str = '', *, details: Mapping[str, Any] | None = None, remediation: str | None = None)

Bases: InputError

Window reference bases do not match EditSpec.ref at rel_pos.

Source code in geno_lewm/errors.py
def __init__(
    self,
    message: str = "",
    *,
    details: Mapping[str, Any] | None = None,
    remediation: str | None = None,
) -> None:
    """Store the message plus optional structured context on the error.

    ``message`` is forwarded to the parent initialiser. ``details`` is
    shallow-copied into a new ``dict`` (``None`` or an empty mapping
    becomes ``{}``); ``remediation`` is kept exactly as passed.
    """
    super().__init__(message)
    self.message = message
    self.remediation = remediation
    # Copy so the stored dict is independent of the caller's mapping object.
    self.details: dict[str, Any] = {} if not details else dict(details)

EventSpec dataclass

EventSpec(name: str, severity: Severity, summary: str, allowed_keys: frozenset[str] = frozenset())

A single row in the EVENTS registry.

allowed_keys lists the data keys that the per-event redaction allowlist permits (RFC-0013 §3.5). Standardized fields (step, epoch, phase, duration_ms, trace_id, span_id, error_code) are promoted out of data before redaction and are always allowed at the top level — they need not appear here.

GenoLeWMLogger

GenoLeWMLogger(component: str, *, run_id: str, log_dir: Path, sink: _Sink, level: Severity = 'info', pretty: bool = False)

Component-scoped structured logger.

Loggers are cheap to construct (cached by (component, run_id)) and thread-safe: the underlying sink serializes writes.

Source code in geno_lewm/observability.py
def __init__(
    self,
    component: str,
    *,
    run_id: str,
    log_dir: Path,
    sink: _Sink,
    level: Severity = "info",
    pretty: bool = False,
) -> None:
    """Bind a logger to a component, a run, a log directory, and a sink.

    Every argument is stored unchanged: ``component``, ``run_id`` and
    ``log_dir`` as public attributes; ``sink``, ``level`` and ``pretty``
    as underscore-prefixed private ones. No validation or I/O happens here.
    """
    # Public identity of this logger.
    self.component, self.run_id, self.log_dir = component, run_id, log_dir
    # Private output configuration.
    self._sink, self._level, self._pretty = sink, level, pretty

LogRecord dataclass

LogRecord(ts: str, severity: Severity, event: str, run_id: str, component: str, data: dict[str, Any] = dict(), step: int | None = None, epoch: int | None = None, phase: str | None = None, duration_ms: int | None = None, trace_id: str | None = None, span_id: str | None = None, error_code: str | None = None)

One row written by the logger.

The record carries the spec-required fields directly and stashes event-specific structured fields under the data attribute. to_dict returns the exact wire shape — keys are stable across versions.

DtypeConfig dataclass

DtypeConfig(encoder_dtype: str, predictor_dtype: str)

Numerical-precision commit shape.

Manifest dataclass

Manifest(schema_version: str, model_name: str, model_version: str, release_id: str, encoder: ManifestEncoder, predictor: ManifestArtifact, action_encoder: ManifestArtifact, calibration: ManifestArtifact, training: ManifestTraining, eval: ManifestArtifact)

Top-level manifest (RFC-0011 §3.7).

to_canonical_dict

to_canonical_dict() -> dict[str, Any]

Return the dict mirror used for canonical JSON.

Dataclass asdict walks nested frozen dataclasses and is deterministic, so the result is byte-stable when fed to the canonical JSON encoder (which also sorts keys).

Source code in geno_lewm/provenance/manifest.py
def to_canonical_dict(self) -> dict[str, Any]:
    """Build the plain-``dict`` mirror that canonical JSON is computed from.

    The conversion is delegated to :func:`dataclasses.asdict`, which
    recurses into nested dataclass fields, so the whole manifest tree
    comes back as builtin containers. Key ordering is left to the
    canonical JSON encoder, which sorts keys itself.
    """
    mirror: dict[str, Any] = asdict(self)
    return mirror

model_id

model_id() -> str

Return model_id = SHA-256(canonical_json(manifest)).

Source code in geno_lewm/provenance/manifest.py
def model_id(self) -> str:
    """Return ``model_id = SHA-256(canonical_json(manifest))``.

    The manifest is first converted with :meth:`to_canonical_dict`; the
    resulting dict is then handed to ``canonical_json_sha256``.
    """
    canonical = self.to_canonical_dict()
    return canonical_json_sha256(canonical)

ManifestArtifact dataclass

ManifestArtifact(file: str, hash: str, dtype: str | None = None, version: str | None = None)

A single artifact file referenced by the manifest.

ManifestEncoder dataclass

ManifestEncoder(id: str, revision: str, hash: str)

Carbon encoder identity.

ManifestTraining dataclass

ManifestTraining(config_file: str, hash: str, data_snapshot: dict[str, str] = dict())

Training config + data-snapshot identifiers.

PoolingConfig dataclass

PoolingConfig(state_layer: int, pool_type: str, pool_radius: int, normalize: bool)

State-encoder pooling configuration commit shape (RFC-0002).

Receipt dataclass

Receipt(schema_version: str, model_id: str, input_commitment: str, output: ReceiptOutput, output_commitment: str, calibration_hash: str, runtime: ReceiptRuntime, timestamp: str, provenance: ReceiptProvenance)

Top-level receipt (RFC-0011 §3.3).

ReceiptOutput dataclass

ReceiptOutput(sigma_raw: float, sigma_calibrated: float, bucket_id: str, confidence: float, low_confidence: bool)

Score-call output committed by the receipt.

ReceiptProvenance dataclass

ReceiptProvenance(kind: str, details: dict[str, Any] | None = None)

Checksum provenance block serialized as provenance in v1 JSON.

ReceiptRuntime dataclass

ReceiptRuntime(backend: str, device: str, geno_lewm_version: str, carbon_revision: str)

Runtime / environment block.

SurpriseResult dataclass

SurpriseResult(sigma_raw: float, sigma_calibrated: float, bucket_id: str, confidence: float, low_confidence: bool)

Calibrated surprise score for one edit.

to_dict

to_dict() -> dict[str, float | str | bool]

Return a JSON-native payload for CLI and JSONL outputs.

Source code in geno_lewm/surprise/score.py
def to_dict(self) -> dict[str, float | str | bool]:
    """Return a JSON-native payload for CLI and JSONL outputs.

    The mapping carries the five result fields under their attribute
    names, in declaration order.
    """
    exported = (
        "sigma_raw",
        "sigma_calibrated",
        "bucket_id",
        "confidence",
        "low_confidence",
    )
    return {name: getattr(self, name) for name in exported}

apply_edit

apply_edit(window: str, edit: RelEdit, *, preserve_length: bool = False) -> str

Return window with edit applied.

window is the pre-edit base string (uppercase ACGTN). The function does not validate window contents beyond what the edit locus requires; that is the caller's responsibility.

The reference bases at the edit locus must match edit.ref_bases case-insensitively — otherwise WindowMismatchError is raised with the locus context attached.

Pass preserve_length=True to truncate / pad the result back to the original window length on the side opposite the edit. The default leaves the indel length change intact (length-preserving is the trainer's responsibility for s_{t+1} encoding).

Source code in geno_lewm/action/apply.py
def apply_edit(window: str, edit: RelEdit, *, preserve_length: bool = False) -> str:
    """Apply a single ``edit`` to ``window`` and return the new string.

    The span ``[edit.rel_pos, edit.rel_pos + len(edit.ref_bases))`` of
    ``window`` is replaced by ``edit.alt_bases``. Nothing about
    ``window`` is validated apart from that span.

    Raises:
        OutOfWindowError: the span starts before 0 or ends past the
            end of ``window``.
        WindowMismatchError: the bases found in the span differ from
            ``edit.ref_bases`` (compared case-insensitively).

    With ``preserve_length=True`` the edited string is passed through
    ``_truncate_or_pad`` together with the original window length and
    the edit position; otherwise any indel length change is kept.
    """
    start = edit.rel_pos
    ref = edit.ref_bases
    window_len = len(window)
    stop = start + len(ref)

    # Guard: the whole reference span has to sit inside the window.
    if not (0 <= start and stop <= window_len):
        raise OutOfWindowError(
            "edit locus is outside the window",
            details={
                "rel_pos": start,
                "ref_len": len(ref),
                "window_len": window_len,
            },
        )

    # Guard: the window must actually carry the stated reference bases.
    found = window[start:stop]
    if found.upper() != ref.upper():
        raise WindowMismatchError(
            "window bases do not match edit.ref_bases at locus",
            details={
                "rel_pos": start,
                "expected_ref": ref,
                "observed_ref": found,
            },
            remediation="re-fetch the window, or correct the EditSpec.ref",
        )

    prefix = window[:start]
    suffix = window[stop:]
    result = prefix + edit.alt_bases + suffix

    if preserve_length:
        return _truncate_or_pad(result, window_len, edit_locus=start)
    return result

apply_edits

apply_edits(window: str, edits: Sequence[RelEdit], *, preserve_length: bool = False) -> str

Apply a sequence of edits to window.

The edits are sorted by descending rel_pos and applied in that order (INV-ARCH-4). Edits must not overlap in genomic coordinates; overlap raises OverlappingEditsError.

Equivalent inputs (same set of edits in any caller-supplied order) produce equivalent outputs — the function is order-invariant after the internal sort, which is the property the training pipeline relies on.

The preserve_length flag truncates / pads back to the input window length using the position of the first (left-most) edit as the reference locus, so the side opposite the edit cluster is the one trimmed.

Source code in geno_lewm/action/apply.py
def apply_edits(
    window: str,
    edits: Sequence[RelEdit],
    *,
    preserve_length: bool = False,
) -> str:
    """Apply several edits to ``window`` and return the result.

    An empty ``edits`` returns ``window`` untouched. Otherwise the edits
    are first checked by ``_assert_disjoint`` (overlap raises
    :class:`OverlappingEditsError`), then sorted by descending
    ``rel_pos`` and applied one by one in that order (INV-ARCH-4).
    Because of the internal sort, the caller-supplied order of the same
    set of edits does not affect the output — the property the training
    pipeline relies on.

    With ``preserve_length=True`` the final string is handed to
    ``_truncate_or_pad`` with the input window length and the smallest
    ``rel_pos`` among the edits (the left-most edit) as reference locus.
    """
    if not edits:
        return window

    _assert_disjoint(edits)

    # Right-to-left application: an indel only shifts coordinates to its
    # right, so positions of the edits still pending stay valid. Length is
    # never fixed up per step — only once, after the loop.
    result = window
    for current in sorted(edits, key=lambda item: item.rel_pos, reverse=True):
        result = apply_edit(result, current, preserve_length=False)

    if preserve_length:
        anchor = min(item.rel_pos for item in edits)
        return _truncate_or_pad(result, len(window), edit_locus=anchor)
    return result

indel

indel(window: str, n: int, *, rng: Random, length_dist: Mapping[int, float] | Sequence[float] | None = None, type_mix: tuple[float, float] = (0.5, 0.5), edge_margin: int = DEFAULT_EDGE_MARGIN) -> list[RelEdit]

Sample n indels (INS or DEL).

length_dist is the event length (number of bases inserted or deleted, exclusive of the VCF anchor base). Default is a truncated geometric over [1, V1_MAX_LEN-1].

type_mix is (p_ins, p_del). Default 50/50.

Source code in geno_lewm/action/synthetic.py
def indel(
    window: str,
    n: int,
    *,
    rng: random.Random,
    length_dist: Mapping[int, float] | Sequence[float] | None = None,
    type_mix: tuple[float, float] = (0.5, 0.5),
    edge_margin: int = DEFAULT_EDGE_MARGIN,
) -> list[RelEdit]:
    """Sample ``n`` indels (INS or DEL) anchored inside ``window``.

    ``length_dist`` describes the *event* length — the number of bases
    inserted or deleted, not counting the VCF anchor base. The drawn
    length is clipped to at most ``V1_MAX_LEN - 1``.

    ``type_mix`` is ``(p_ins, p_del)``; it is normalised by its sum, so
    only the ratio matters. Default 50/50.

    Raises:
        InputError: ``n`` is negative, ``type_mix`` has a negative entry
            or a non-positive sum, or the attempt budget ran out before
            ``n`` edits could be produced.
    """
    _validate_window(window, edge_margin)
    if n < 0:
        raise InputError("n must be non-negative", details={"n": n})
    if any(p < 0 for p in type_mix) or sum(type_mix) <= 0:
        raise InputError(
            "type_mix must contain non-negative probs that sum > 0",
            details={"type_mix": list(type_mix)},
        )

    insert_prob = type_mix[0] / sum(type_mix)
    window_len = len(window)
    # A deletion's reference segment may not reach past this index.
    right_limit = window_len - edge_margin

    sampled: list[RelEdit] = []
    # A slot is redrawn whenever the anchor is non-ACGT or a deletion segment
    # contains a non-ACGT base, so windows with occasional N bases still yield
    # exactly ``n`` edits. The number of draws is capped so that a window that
    # can never satisfy a slot (e.g. all N) raises instead of spinning forever.
    # When every draw succeeds, exactly ``n`` iterations run and the RNG is
    # consumed precisely as a plain ``for _ in range(n)`` loop would.
    attempt_budget = n * 16 + 16
    for _attempt in range(attempt_budget):
        if len(sampled) >= n:
            break
        pos = _pick_position(rng, window_len, edge_margin)
        anchor = window[pos]
        if anchor not in _OTHER_BASE:
            continue  # non-ACGT anchor; resample

        # Clip the caller's distribution so that anchor + event never
        # exceeds V1_MAX_LEN bases on either the ref or the alt side.
        event_len = min(_draw_indel_length(rng, length_dist), V1_MAX_LEN - 1)

        wants_insertion = rng.random() < insert_prob
        del_end = pos + 1 + event_len
        if wants_insertion or del_end > right_limit:
            # Either an insertion was drawn, or the deletion would cross the
            # right margin and is swapped for an insertion of equal length.
            # ref = anchor, alt = anchor + event_len random bases.
            inserted = _rand_bases(rng, event_len)
            sampled.append(
                RelEdit(
                    rel_pos=pos,
                    edit_type=EditType.INS,
                    ref_bases=anchor,
                    alt_bases=anchor + inserted,
                )
            )
            continue

        # Deletion: ref = anchor + event_len following bases, alt = anchor.
        deleted = window[pos:del_end]
        if not all(base in _OTHER_BASE for base in deleted):
            continue  # segment holds a non-ACGT base; resample
        sampled.append(
            RelEdit(
                rel_pos=pos,
                edit_type=EditType.DEL,
                ref_bases=deleted,
                alt_bases=anchor,
            )
        )

    if len(sampled) < n:
        raise InputError(
            "could not sample enough indels in the window's interior (too many N bases)",
            details={
                "requested": n,
                "produced": len(sampled),
                "window_len": window_len,
                "edge_margin": edge_margin,
            },
        )
    return sampled

mnv

mnv(window: str, n: int, *, rng: Random, length_dist: Mapping[int, float] | Sequence[float] | None = None, edge_margin: int = DEFAULT_EDGE_MARGIN) -> list[RelEdit]

Sample n MNVs (length-preserving multi-base substitutions).

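Length is drawn from length_dist (default uniform over [2, 8] per RFC text). The alt is guaranteed different from ref at every base (otherwise constructing a RelEdit with that ref/alt would be rejected by EditSpec validation). The returned list may hold fewer than n edits when candidate draws are rejected, which happens if the segment would cross the right edge margin or contains a non-ACGT base.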

Source code in geno_lewm/action/synthetic.py
def mnv(
    window: str,
    n: int,
    *,
    rng: random.Random,
    length_dist: Mapping[int, float] | Sequence[float] | None = None,
    edge_margin: int = DEFAULT_EDGE_MARGIN,
) -> list[RelEdit]:
    """Sample ``n`` MNVs (length-preserving multi-base substitutions).

    Length is drawn from ``length_dist`` (default uniform over [2, 8]
    per RFC text) and clamped into ``[2, V1_MAX_LEN]``. Every base of
    the alt is drawn from ``_OTHER_BASE`` of the matching ref base.

    A candidate is rejected when its segment would cross the right edge
    margin, when the segment contains a base that is not a key of
    ``_OTHER_BASE``, or when the built alt equals the ref. Rejected
    candidates are redrawn rather than dropped, within a budget of
    ``n * 16 + 16`` draws (the same bound ``indel`` uses). If the budget
    is exhausted the edits gathered so far are returned, so on windows
    that can rarely host an MNV the list may hold fewer than ``n``
    edits; no exception is raised for that case.

    Raises:
        InputError: ``n`` is negative.
    """
    _validate_window(window, edge_margin)
    if n < 0:
        raise InputError("n must be non-negative", details={"n": n})

    if length_dist is None:
        length_dist = dict.fromkeys(range(2, 9), 1.0)  # uniform on [2, 8]

    # Segments may not reach past this index (loop-invariant).
    right_limit = len(window) - edge_margin

    out: list[RelEdit] = []
    # Previously each of the ``n`` iterations that hit a rejection simply
    # lost its slot, so callers routinely received fewer than ``n`` edits
    # even on clean windows (any draw near the right margin was enough).
    # Redraw instead, but keep the total bounded so an unsatisfiable window
    # terminates. When every draw is accepted first try, exactly ``n``
    # iterations run and the RNG stream is consumed as before; only runs
    # that used to come back short now draw (and return) more.
    max_attempts = n * 16 + 16
    attempts = 0
    while len(out) < n and attempts < max_attempts:
        attempts += 1
        pos = _pick_position(rng, len(window), edge_margin)
        length = max(2, min(_draw_indel_length(rng, length_dist), V1_MAX_LEN))
        end = pos + length
        if end > right_limit:
            continue  # would cross the right margin; resample
        ref_seg = window[pos:end]
        if any(c not in _OTHER_BASE for c in ref_seg):
            continue  # non-ACGT base inside the segment; resample
        # Build alt by perturbing every base to a draw from its alternatives.
        alt_seg = "".join(rng.choice(_OTHER_BASE[c]) for c in ref_seg)
        if alt_seg == ref_seg:
            continue  # defensive: never emit a no-op substitution
        out.append(
            RelEdit(
                rel_pos=pos,
                edit_type=EditType.MNV,
                ref_bases=ref_seg,
                alt_bases=alt_seg,
            )
        )
    return out

uniform_snv

uniform_snv(window: str, n: int, *, rng: Random, edge_margin: int = DEFAULT_EDGE_MARGIN) -> list[RelEdit]

Sample n uniform SNVs anchored inside window.

Each SNV's alt is uniformly drawn from the three non-reference bases at the chosen position, so the contract "alt is always non-reference" is enforced by construction.

Returns edits in the order they were sampled. The list may contain duplicates by position — the caller (data pipeline) is responsible for deduplication if it needs disjoint edits.

Source code in geno_lewm/action/synthetic.py
def uniform_snv(
    window: str,
    n: int,
    *,
    rng: random.Random,
    edge_margin: int = DEFAULT_EDGE_MARGIN,
) -> list[RelEdit]:
    """Sample ``n`` uniform SNVs anchored inside ``window``.

    For every SNV the ``alt`` base is picked with ``rng.choice`` from
    ``_OTHER_BASE[ref]``, the alternatives registered for the reference
    base at the chosen position.

    Edits are returned in sampling order. Positions are drawn
    independently, so the same position can occur more than once — the
    caller (data pipeline) must deduplicate if it needs disjoint edits.

    Raises:
        InputError: ``n`` is negative, or a slot found no usable
            reference base after one draw plus ten redraws.
    """
    _validate_window(window, edge_margin)
    if n < 0:
        raise InputError("n must be non-negative", details={"n": n})

    # One initial draw plus ten retries per requested SNV. If the window is
    # mostly N's the caller should not be using a synthetic sampler.
    max_draws = 1 + 10
    window_len = len(window)

    sampled: list[RelEdit] = []
    for _slot in range(n):
        for _draw in range(max_draws):
            pos = _pick_position(rng, window_len, edge_margin)
            ref = window[pos]
            if ref in _OTHER_BASE:
                break
            # Window holds 'N' or another non-ACGT base here; draw again.
        else:  # pragma: no cover - defensive
            raise InputError(
                "could not find an ACGT position in the window's interior",
                details={"window_len": window_len, "edge_margin": edge_margin},
            )
        alt = rng.choice(_OTHER_BASE[ref])
        sampled.append(
            RelEdit(rel_pos=pos, edit_type=EditType.SNV, ref_bases=ref, alt_bases=alt)
        )
    return sampled

deprecated

deprecated(reason: str = '') -> Callable[[F], F]

Mark obj as deprecated.

Usage:

    @deprecated("use new_thing() instead; removed in v0.3")
    def old_thing(...): ...

Emits DeprecationWarning once per process per call site — that is, once per (filename, lineno) of the calling code. Multiple call sites all warn; the same site warns only once.

reason is appended to the warning message; pass a short actionable string ("use X instead").

Source code in geno_lewm/api.py
def deprecated(reason: str = "") -> Callable[[F], F]:
    """Build a decorator that marks a function or class as deprecated.

    Usage::

        @deprecated("use new_thing() instead; removed in v0.3")
        def old_thing(...): ...

    A :class:`DeprecationWarning` is requested through ``_emit_once``
    with a key of ``(id(target), filename, lineno)``, where the file and
    line come from ``_caller_site`` — i.e. once per process **per call
    site**. Distinct call sites each warn; a repeated site stays silent.

    Functions are wrapped; classes are returned as the same object with
    their ``__init__`` replaced in place. Both get the marker attribute
    ``__geno_lewm_deprecated__ = True``.

    ``reason`` is appended to the warning message after a space; pass a
    short actionable string ("use X instead"). A non-``str`` ``reason``
    raises ``InputError``.
    """
    if not isinstance(reason, str):
        from geno_lewm.errors import InputError  # type: ignore[unreachable]

        raise InputError(
            "deprecated(reason) must be str",
            details={"type": type(reason).__name__},
        )

    def _decorate(target: F) -> F:
        headline = f"{target.__module__}.{target.__qualname__} is deprecated."
        warning_text = f"{headline} {reason}" if reason else headline
        # Stable for the lifetime of ``target``, which the closures keep alive.
        target_key = id(target)

        if not inspect.isclass(target):
            # Plain callable: warn, then delegate.
            @functools.wraps(target)
            def _wrap(*args: Any, **kwargs: Any) -> Any:
                # The site lookup and the warning both count stack frames
                # from this wrapper, so they must stay inline here.
                file, line = _caller_site(depth=2)
                _emit_once(
                    (target_key, file, line),
                    DeprecationWarning,
                    warning_text,
                    stacklevel=3,
                )
                return target(*args, **kwargs)

            _wrap.__geno_lewm_deprecated__ = True  # type: ignore[attr-defined]
            return cast(F, _wrap)

        # Class: patch ``__init__`` so instantiation triggers the warning.
        original_init: Callable[..., None] = target.__init__

        @functools.wraps(original_init)
        def __init__(self: Any, *args: Any, **kwargs: Any) -> None:  # noqa: N807
            file, line = _caller_site(depth=2)
            _emit_once(
                (target_key, file, line),
                DeprecationWarning,
                warning_text,
                stacklevel=3,
            )
            original_init(self, *args, **kwargs)

        setattr(target, "__init__", __init__)  # noqa: B010
        target.__geno_lewm_deprecated__ = True  # type: ignore[attr-defined]
        return target

    return _decorate

experimental

experimental(obj: F) -> F
experimental(*, reason: str = '') -> Callable[[F], F]
experimental(obj: Any = None, *, reason: str = '') -> Any

Mark obj (a function or class) as experimental.

Usage:

    @experimental
    def f(...): ...

    @experimental(reason="API shape under review")
    class C: ...

Emits :class:FutureWarning once per process when the decorated object is first invoked (function) or instantiated (class). Later calls are silent.

Source code in geno_lewm/api.py
def experimental(obj: Any = None, *, reason: str = "") -> Any:
    """Flag a function or class as experimental API.

    The decorator works both bare and parameterised::

        @experimental
        def f(...): ...

        @experimental(reason="API shape under review")
        class C: ...

    The first call of a decorated function, or the first instantiation
    of a decorated class, emits a :class:`FutureWarning`; every later
    use in the same process is silent. A non-empty ``reason`` is
    appended to the warning text. Decorated objects carry a
    ``__geno_lewm_experimental__ = True`` marker attribute.
    """

    def _decorate(target: Any) -> Any:
        notice = f"{target.__module__}.{target.__qualname__} is experimental and may change without notice."
        if reason:
            notice = f"{notice} {reason}"
        # The warning is de-duplicated on the identity of the object being
        # decorated (the original function, or the class itself).
        once_key = id(target)

        if not inspect.isclass(target):
            # Functions are replaced by a thin wrapper that warns, then delegates.
            @functools.wraps(target)
            def _wrap(*args: Any, **kwargs: Any) -> Any:
                _emit_once(once_key, FutureWarning, notice, stacklevel=2)
                return target(*args, **kwargs)

            _wrap.__geno_lewm_experimental__ = True  # type: ignore[attr-defined]
            return _wrap

        # Classes are patched in place: the same class object is returned,
        # with its ``__init__`` swapped for a warning shim.
        wrapped_init = target.__init__

        @functools.wraps(wrapped_init)
        def __init__(self: Any, *args: Any, **kwargs: Any) -> None:  # noqa: N807
            _emit_once(once_key, FutureWarning, notice, stacklevel=2)
            wrapped_init(self, *args, **kwargs)

        target.__init__ = __init__
        target.__geno_lewm_experimental__ = True
        return target

    # Bare ``@experimental`` passes the decorated object positionally;
    # ``@experimental(reason=…)`` passes nothing and expects a decorator back.
    if obj is None or not callable(obj):
        return _decorate
    return _decorate(obj)

fail_closed_network_guard

fail_closed_network_guard() -> Iterator[None]

Block common network entry points inside an inference path.

Source code in geno_lewm/deploy/runtime.py
@contextlib.contextmanager
def fail_closed_network_guard() -> Iterator[None]:
    """Disable common network entry points for the duration of the block.

    While the context is active, each patched callable raises
    :class:`NetworkCallProhibitedError` instead of opening a connection.
    The patches are undone when the block exits, normally or not.
    """
    # Dotted paths of the socket / urllib / http.client callables to replace.
    guarded_targets = (
        "socket.create_connection",
        "socket.socket.connect",
        "urllib.request.urlopen",
        "http.client.HTTPConnection.connect",
        "http.client.HTTPSConnection.connect",
    )

    def _blocked(*_args: Any, **_kwargs: Any) -> NoReturn:
        # Accepts any call shape so it can stand in for functions and methods alike.
        raise NetworkCallProhibitedError(
            "runtime network call attempted after setup",
            remediation="perform downloads only through explicit setup/update commands",
        )

    # ExitStack keeps every patch active until the block exits, then unwinds them all.
    with contextlib.ExitStack() as patches:
        for dotted_path in guarded_targets:
            patches.enter_context(patch(dotted_path, _blocked))
        yield

probe_backends

probe_backends(model_dir: str | Path | None = None) -> tuple[BackendProbe, ...]

Probe runtime backends in RFC-0010 auto-selection order.

Source code in geno_lewm/deploy/runtime.py
def probe_backends(model_dir: str | Path | None = None) -> tuple[BackendProbe, ...]:
    """Probe every runtime backend, in RFC-0010 auto-selection order.

    ``model_dir`` (with ``~`` expanded) is handed to each backend-specific
    probe; ``None`` is passed through unchanged. The result always ends
    with a CPU probe marked available.
    """
    root: Path | None
    if model_dir is None:
        root = None
    else:
        root = Path(model_dir).expanduser()

    coreml = _probe_coreml(root)
    cuda = _probe_cuda(root)
    onnx = _probe_onnx(root)
    cpu = BackendProbe(BACKEND_CPU, True, "portable CPU fallback is always available")
    return (coreml, cuda, onnx, cpu)

select_backend

select_backend(backend: str = BACKEND_AUTO, *, probes: Sequence[BackendProbe] | None = None) -> str

Select a backend from probe results, or raise if the requested one is unavailable.

Source code in geno_lewm/deploy/runtime.py
def select_backend(
    backend: str = BACKEND_AUTO,
    *,
    probes: Sequence[BackendProbe] | None = None,
) -> str:
    """Pick the runtime backend to use, given probe results.

    With ``backend`` resolving to ``BACKEND_AUTO`` the first available
    entry of ``BACKEND_PRIORITY`` wins. Any other value is returned
    as-is (normalised) provided it was probed and is available.
    ``probes`` defaults to a fresh :func:`probe_backends` run.

    Raises:
        BackendUnsupportedError: no backend is available (auto mode), or
            the requested backend was not probed or is unavailable.
    """
    normalized = _normalize_backend(backend)
    if probes is None:
        observed = tuple(probe_backends())
    else:
        observed = tuple(probes)

    # Index probes by backend name; a later duplicate replaces an earlier one.
    by_backend = {}
    for item in observed:
        by_backend[item.backend] = item

    if normalized != BACKEND_AUTO:
        requested = by_backend.get(normalized)
        if requested is None:
            raise BackendUnsupportedError(
                "requested runtime backend was not probed",
                details={"backend": normalized, "probed": sorted(by_backend)},
            )
        if not requested.available:
            raise BackendUnsupportedError(
                "requested runtime backend is unavailable",
                details={"backend": normalized, "reason": requested.reason},
            )
        return normalized

    # Auto mode: walk the priority list and take the first usable backend.
    for name in BACKEND_PRIORITY:
        candidate = by_backend.get(name)
        if candidate is not None and candidate.available:
            return name
    raise BackendUnsupportedError(
        "no runtime backend is available",
        details={"probes": [_probe_details(item) for item in observed]},
    )

exit_code_for

exit_code_for(exc: BaseException) -> int

Return the CLI exit code for exc.

Used by geno_lewm.cli._dispatch. Non-GenoLeWM exceptions map to exit code 1 ("uncategorized failure"); KeyboardInterrupt maps to 130.

Source code in geno_lewm/errors.py
def exit_code_for(exc: BaseException) -> int:
    """Map ``exc`` to the process exit code the CLI should return.

    Used by ``geno_lewm.cli._dispatch``.

    - ``KeyboardInterrupt`` -> 130.
    - A ``GenoLeWMError`` -> the code of the first family in
      ``_EXIT_CODE_BY_FAMILY`` it is an instance of, or 1 if none match.
    - Anything else -> 1 ("uncategorized failure").
    """
    if isinstance(exc, KeyboardInterrupt):
        return 130
    if isinstance(exc, GenoLeWMError):
        # Table order matters: the first matching family decides the code.
        matching_codes = (code for family, code in _EXIT_CODE_BY_FAMILY if isinstance(exc, family))
        return next(matching_codes, 1)
    return 1

current_trace_context

current_trace_context() -> tuple[str | None, str | None]

Return (trace_id, span_id) from the current context.

Source code in geno_lewm/observability.py
def current_trace_context() -> tuple[str | None, str | None]:
    """Read the active trace context as a ``(trace_id, span_id)`` pair."""
    trace_id = _TRACE_ID.get()
    span_id = _SPAN_ID.get()
    return trace_id, span_id

get_logger

get_logger(component: str, *, run_id: str | None = None, log_dir: str | PathLike[str] | None = None, level: Severity | None = None, pretty: bool | None = None) -> GenoLeWMLogger

Return a logger bound to component.

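Loggers are cached by (component, run_id, log_dir); calling get_logger twice with the same arguments returns the same instance, so independent subsystems share one ordered stream per run. This only holds when the run id is stable, i.e. passed explicitly or taken from $GENO_LEWM_RUN_ID; otherwise each call generates a fresh random run id and therefore a new logger.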

Defaults:

  • run_id: $GENO_LEWM_RUN_ID or a random run-<hex>.
  • log_dir: $GENO_LEWM_LOG_DIR or ~/.geno-lewm/logs.
  • level: $GENO_LEWM_LOG_LEVEL (default info).
  • pretty: TTY-detected, overridable by $GENO_LEWM_LOG_FORMAT.
Source code in geno_lewm/observability.py
def get_logger(
    component: str,
    *,
    run_id: str | None = None,
    log_dir: str | os.PathLike[str] | None = None,
    level: Severity | None = None,
    pretty: bool | None = None,
) -> GenoLeWMLogger:
    """Return the logger for ``component`` within a run.

    Loggers are memoised on ``(component, run_id, resolved log_dir)``, so
    repeated calls with the same effective arguments hand back one shared
    instance. On a cache hit the existing logger is returned untouched;
    ``level`` and ``pretty`` only take effect when a logger is created.

    Fallbacks for omitted arguments:

    - ``run_id``: ``$GENO_LEWM_RUN_ID``, else a newly generated run id.
    - ``log_dir``: resolved by ``_resolve_log_dir``
      (``$GENO_LEWM_LOG_DIR`` or ``~/.geno-lewm/logs``).
    - ``level``: ``$GENO_LEWM_LOG_LEVEL`` (default ``info``).
    - ``pretty``: TTY-detected, overridable by ``$GENO_LEWM_LOG_FORMAT``.
    """
    rid = run_id or os.environ.get("GENO_LEWM_RUN_ID")
    if not rid:
        rid = _new_run_id()
    ldir = _resolve_log_dir(log_dir)
    # The sink is opened before the cache lookup, i.e. on every call.
    sink = _open_sink(rid, ldir)

    lvl: Severity
    if level is None:
        lvl = _env_level()
    else:
        lvl = level
    if pretty is None:
        pp = _env_pretty()
    else:
        pp = pretty

    # Attach the optional W&B sink when a project is configured.
    project = _resolve_wandb_project(None)
    if project is not None:
        _ensure_wandb_sink(run_id=rid, project=project)

    cache_key = (component, rid, str(ldir.resolve()))
    with _LOGGERS_LOCK:
        cached = _LOGGERS.get(cache_key)
        if cached is None:
            cached = GenoLeWMLogger(
                component=component,
                run_id=rid,
                log_dir=ldir,
                sink=sink,
                level=lvl,
                pretty=pp,
            )
            _LOGGERS[cache_key] = cached
        return cached

logged_run

logged_run(component: str = 'runtime', *, run_id: str | None = None, log_dir: str | PathLike[str] | None = None, start_event: str | None = None, end_event: str | None = None, start_data: Mapping[str, Any] | None = None) -> Iterator[GenoLeWMLogger]

Open a sink for the run; flush on exit; never swallow exceptions.

The wrapper guarantees that any records emitted up to a crash are flushed to disk (INV-OBS-6: "a crash before logger init still produces a sanitized minimal record"). Optional start_event / end_event book-end the run. If the block raises and the exception is a geno_lewm.errors.GenoLeWMError, an error record is emitted before the exception propagates.

Source code in geno_lewm/observability.py
@contextlib.contextmanager
def logged_run(
    component: str = "runtime",
    *,
    run_id: str | None = None,
    log_dir: str | os.PathLike[str] | None = None,
    start_event: str | None = None,
    end_event: str | None = None,
    start_data: Mapping[str, Any] | None = None,
) -> Iterator[GenoLeWMLogger]:
    """Open a sink for the run; flush on exit; never swallow exceptions.

    The wrapper guarantees that any records emitted up to a crash are
    flushed to disk (INV-OBS-6: "a crash before logger init still
    produces a sanitized minimal record"). Optional ``start_event`` /
    ``end_event`` book-end the run; ``start_data`` is attached to the
    start record and is ignored when ``start_event`` is not given. If
    the block raises and the exception is a
    ``geno_lewm.errors.GenoLeWMError``, an ``error`` record is emitted
    before the exception propagates.

    The sink is flushed on every exit path, successful or not; a failure
    of the flush itself is suppressed so it can never mask the outcome
    of the block.

    Yields:
        The logger bound to ``component`` for this run.
    """
    logger = get_logger(component, run_id=run_id, log_dir=log_dir)
    if start_event:
        logger.info(start_event, **(dict(start_data) if start_data else {}))
    try:
        yield logger
    except BaseException as exc:
        from geno_lewm.errors import GenoLeWMError  # local import to avoid cycle

        # Only typed project errors carry the structured fields logged here;
        # every exception, typed or not, is re-raised unchanged.
        if isinstance(exc, GenoLeWMError):
            logger.error(
                "error",
                error_code=exc.code,
                message=exc.message,
                details=exc.details,
                remediation=exc.remediation,
            )
        raise
    else:
        if end_event:
            logger.info(end_event)
    finally:
        # Flush on success as well as on failure, so the end-of-run record
        # (and anything before it) reaches disk as the docstring promises.
        with contextlib.suppress(Exception):
            logger._sink.flush()

set_trace_context

set_trace_context(*, trace_id: str | None, span_id: str | None) -> Iterator[None]

Push (trace_id, span_id) into the contextvar for the block.

Source code in geno_lewm/observability.py
@contextlib.contextmanager
def set_trace_context(*, trace_id: str | None, span_id: str | None) -> Iterator[None]:
    """Bind ``trace_id`` and ``span_id`` to the trace contextvars for the block.

    Both variables are reset to their previous values when the block
    exits, whether it returns normally or raises.
    """
    # Keep the tokens returned by ``set`` so the prior values can be restored.
    trace_token = _TRACE_ID.set(trace_id)
    span_token = _SPAN_ID.set(span_id)
    try:
        yield
    finally:
        _TRACE_ID.reset(trace_token)
        _SPAN_ID.reset(span_token)

canonical_json_sha256

canonical_json_sha256(value: Any) -> str

Return "sha256:<hex>" for the canonical JSON of value.

Source code in geno_lewm/provenance/hashing.py
def canonical_json_sha256(value: Any) -> str:
    """Hash the canonical JSON encoding of ``value``; returns ``"sha256:<hex>"``."""
    encoded = canonical_json_bytes(value)
    hex_digest = hashlib.sha256(encoded).hexdigest()
    return _PREFIX + hex_digest

compute_input_commitment

compute_input_commitment(reference_window: str, edit_spec: EditSpec, pooling_config: PoolingConfig, dtype_config: DtypeConfig) -> str

Return the "sha256:<hex>" input commitment for a scoring call.

The canonical payload is a dict with fixed keys; canonical-JSON encoding handles ordering and stability.

Source code in geno_lewm/provenance/commitment.py
def compute_input_commitment(
    reference_window: str,
    edit_spec: EditSpec,
    pooling_config: PoolingConfig,
    dtype_config: DtypeConfig,
) -> str:
    """Compute the ``"sha256:<hex>"`` input commitment for a scoring call.

    The commitment is the canonical-JSON hash of a fixed-key payload
    holding the reference window, the edit spec, the pooling and dtype
    settings, and a payload ``version`` of 1. Key ordering and byte
    stability are delegated to the canonical-JSON encoder.

    Raises:
        InputError: ``reference_window`` is not a ``str`` or is empty.
    """
    if not isinstance(reference_window, str):
        raise InputError(
            "reference_window must be a string of bases",
            details={"type": type(reference_window).__name__},
        )
    if reference_window == "":
        raise InputError(
            "reference_window must be non-empty",
            details={"len": 0},
        )

    edit_fields = _editspec_to_commit_dict(edit_spec)
    pooling_fields = {
        "state_layer": pooling_config.state_layer,
        "pool_type": pooling_config.pool_type,
        "pool_radius": pooling_config.pool_radius,
        "normalize": pooling_config.normalize,
    }
    dtype_fields = {
        "encoder_dtype": dtype_config.encoder_dtype,
        "predictor_dtype": dtype_config.predictor_dtype,
    }
    return canonical_json_sha256(
        {
            "reference_window": reference_window,
            "edit_spec": edit_fields,
            "pooling_config": pooling_fields,
            "dtype_config": dtype_fields,
            "version": 1,
        }
    )

compute_output_commitment

compute_output_commitment(output: ReceiptOutput) -> str

Compute the output-commitment hash for an output block.

Separated from Receipt so callers can pre-compute the commitment before assembling the receipt.

Source code in geno_lewm/provenance/receipt.py
def compute_output_commitment(output: ReceiptOutput) -> str:
    """Return the output-commitment hash of a receipt output block.

    Kept independent of ``Receipt`` so the commitment can be computed
    before the receipt itself is assembled.
    """
    # The dataclass is flattened to plain dicts before canonical hashing.
    output_fields = asdict(output)
    return canonical_json_sha256(output_fields)

load_manifest

load_manifest(path: str | Path) -> Manifest

Load and validate a manifest from disk.

Source code in geno_lewm/provenance/manifest.py
def load_manifest(path: str | Path) -> Manifest:
    """Load and validate a manifest from disk.

    The file is read as bytes, decoded as UTF-8, parsed as JSON and then
    handed to ``_from_dict`` for validation.

    Raises:
        InputError: the file is not valid UTF-8 or not valid JSON.
        OSError: the file cannot be read (propagated from ``Path.read_bytes``).
    """
    p = Path(path)
    raw = p.read_bytes()
    try:
        d = json.loads(raw.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        # UnicodeDecodeError is not a JSONDecodeError; without catching it a
        # manifest with invalid UTF-8 bytes would escape as an untyped error.
        raise InputError(
            "manifest is not valid JSON",
            details={"path": str(p), "error": str(exc)},
        ) from exc
    return _from_dict(d)

read_receipt

read_receipt(path: str | Path) -> Receipt

Load and validate a receipt from disk.

Source code in geno_lewm/provenance/receipt.py
def read_receipt(path: str | Path) -> Receipt:
    """Load and validate a receipt from disk.

    The file is read as bytes, decoded as UTF-8 and parsed as JSON; the
    top-level value must be an object, which is then validated by
    ``parse_receipt_payload``.

    Raises:
        ReceiptSchemaError: the file is not valid UTF-8, not valid JSON,
            or its top-level value is not a JSON object.
        OSError: the file cannot be read (propagated from ``Path.read_bytes``).
    """
    p = Path(path)
    raw = p.read_bytes()
    try:
        d = json.loads(raw.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        # UnicodeDecodeError is not a JSONDecodeError; without catching it a
        # receipt with invalid UTF-8 bytes would escape as an untyped error.
        raise ReceiptSchemaError(
            "receipt is not valid JSON",
            details={"path": str(p), "error": str(exc)},
        ) from exc

    if not isinstance(d, dict):
        raise ReceiptSchemaError(
            "receipt top-level must be an object",
            details={"path": str(p), "type": type(d).__name__},
        )
    return parse_receipt_payload(d)

sha256_bytes

sha256_bytes(data: bytes | bytearray | memoryview) -> str

Return "sha256:<hex>" for data.

Source code in geno_lewm/provenance/hashing.py
def sha256_bytes(data: bytes | bytearray | memoryview) -> str:
    """Hash an in-memory buffer; returns ``"sha256:<hex>"``."""
    hasher = hashlib.sha256()
    # Normalise any accepted buffer type to an immutable ``bytes`` copy first.
    hasher.update(bytes(data))
    return _PREFIX + hasher.hexdigest()

sha256_file

sha256_file(path: str | Path) -> str

Return "sha256:<hex>" for the file at path.

Streams the file in 1 MiB chunks; safe for arbitrarily large artifacts (weights files can be multi-GB).

Source code in geno_lewm/provenance/hashing.py
def sha256_file(path: str | Path) -> str:
    """Hash the file at ``path``; returns ``"sha256:<hex>"``.

    The file is read in ``_CHUNK``-sized pieces rather than loaded whole,
    so memory use stays bounded for arbitrarily large artifacts (weights
    files can be multi-GB).
    """
    digest = hashlib.sha256()
    with Path(path).open("rb") as stream:
        # A binary read returns b"" only at end of file, which ends the loop.
        for block in iter(lambda: stream.read(_CHUNK), b""):
            digest.update(block)
    return _PREFIX + digest.hexdigest()

write_manifest

write_manifest(manifest: Manifest, path: str | Path) -> Path

Write a manifest to disk as canonical JSON.

The on-disk bytes are byte-stable across platforms.

Source code in geno_lewm/provenance/manifest.py
def write_manifest(manifest: Manifest, path: str | Path) -> Path:
    """Serialise ``manifest`` as canonical JSON at ``path``.

    Missing parent directories are created first. The bytes written are
    exactly those returned by ``manifest.to_canonical_json()``, which is
    what keeps the on-disk form byte-stable across platforms.

    Returns:
        The destination as a :class:`~pathlib.Path`.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = manifest.to_canonical_json()
    destination.write_bytes(payload)
    return destination

write_receipt

write_receipt(receipt: Receipt, path: str | Path) -> Path

Write a receipt as canonical JSON; round-trip byte-stable.

Source code in geno_lewm/provenance/receipt.py
def write_receipt(receipt: Receipt, path: str | Path) -> Path:
    """Serialise ``receipt`` as canonical JSON at ``path``; round-trip byte-stable.

    Missing parent directories are created first; the bytes written are
    exactly those returned by ``receipt.to_canonical_json()``.

    Returns:
        The destination as a :class:`~pathlib.Path`.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = receipt.to_canonical_json()
    destination.write_bytes(payload)
    return destination

score_variant

score_variant(variant: EditSpec, encoder: object, action_encoder: object, predictor: object, calibration: CalibrationTable, *, reference_window: str, window_start_bp: int = 0, region: str | Sequence[str] | None = None, repeat: str | Sequence[str] | None = None, aggregation: str = 'mean', min_bucket_size: int = DEFAULT_MIN_BUCKET_SIZE) -> SurpriseResult

Score one edit against a caller-supplied reference window.

The scorer is intentionally model-object agnostic: callers can pass the concrete training-time modules or small deterministic fakes. FASTA-backed window extraction is available through :func:score_vcf; checkpoint loading is owned by higher runtime layers.

Source code in geno_lewm/surprise/score.py
def score_variant(
    variant: EditSpec,
    encoder: object,
    action_encoder: object,
    predictor: object,
    calibration: CalibrationTable,
    *,
    reference_window: str,
    window_start_bp: int = 0,
    region: str | Sequence[str] | None = None,
    repeat: str | Sequence[str] | None = None,
    aggregation: str = "mean",
    min_bucket_size: int = DEFAULT_MIN_BUCKET_SIZE,
) -> SurpriseResult:
    """Score a single edit against a reference window supplied by the caller.

    The model arguments are typed as plain ``object`` on purpose: the
    concrete training-time modules and small deterministic fakes are
    equally acceptable. FASTA-backed window extraction is available
    through :func:`score_vcf`; checkpoint loading is owned by higher
    runtime layers.

    The raw surprise is computed first, then calibrated against the
    bucket that ``calibration`` resolves for it; the returned result
    reports the resolved bucket's id and confidence flags.
    """
    _require_calibration_table(calibration)
    min_size = _require_positive_int("min_bucket_size", min_bucket_size)

    raw_options = {
        "reference_window": reference_window,
        "window_start_bp": window_start_bp,
        "region": region,
        "repeat": repeat,
        "aggregation": aggregation,
    }
    bucket_id, sigma_raw = _raw_surprise(
        variant,
        encoder,
        action_encoder,
        predictor,
        **raw_options,
    )

    # The resolved bucket may differ from the requested one, so the result
    # carries the resolved bucket's own id rather than ``bucket_id``.
    bucket = calibration.resolve(bucket_id, min_bucket_size=min_size)
    sigma_calibrated = _cdf_percentile(bucket, sigma_raw)
    return SurpriseResult(
        sigma_raw=sigma_raw,
        sigma_calibrated=sigma_calibrated,
        bucket_id=bucket.bucket_id,
        confidence=bucket.confidence,
        low_confidence=bucket.low_confidence,
    )

score_vcf

score_vcf(vcf_path: str | Path, encoder: object, action_encoder: object, predictor: object, calibration: CalibrationTable, output_path: str | Path, *, reference_windows: Mapping[str, str] | None = None, reference_fasta: str | Path | None = None, window_bp: int = DEFAULT_WINDOW_BP, window_start_bp: int = 0, region: str | Sequence[str] | None = None, repeat: str | Sequence[str] | None = None, aggregation: str = 'mean', show_progress: bool = True, batch_size: int = 64, min_bucket_size: int = DEFAULT_MIN_BUCKET_SIZE) -> Path

Score VCF rows and write one JSON object per scored alternate.

Pass reference_fasta for local FASTA-backed window extraction. reference_windows remains useful for tests and already-extracted windows. Mapping keys are tried in this order: chrom:pos:ref:alt, chrom:pos, then chrom.

Source code in geno_lewm/surprise/score.py
def score_vcf(
    vcf_path: str | Path,
    encoder: object,
    action_encoder: object,
    predictor: object,
    calibration: CalibrationTable,
    output_path: str | Path,
    *,
    reference_windows: Mapping[str, str] | None = None,
    reference_fasta: str | Path | None = None,
    window_bp: int = DEFAULT_WINDOW_BP,
    window_start_bp: int = 0,
    region: str | Sequence[str] | None = None,
    repeat: str | Sequence[str] | None = None,
    aggregation: str = "mean",
    show_progress: bool = True,
    batch_size: int = 64,
    min_bucket_size: int = DEFAULT_MIN_BUCKET_SIZE,
) -> Path:
    """Score the rows of a VCF and write JSON Lines, one object per scored alternate.

    Pass ``reference_fasta`` for local FASTA-backed window extraction.
    ``reference_windows`` remains useful for tests and already-extracted
    windows. Mapping keys are tried in this order:
    ``chrom:pos:ref:alt``, ``chrom:pos``, then ``chrom``.

    Each output line holds the schema/generator stamps, the variant's
    ``chrom``/``pos``/``ref``/``alt`` and the fields of the score result,
    serialised with sorted keys. Missing parent directories of
    ``output_path`` are created, and an existing file is overwritten.

    Returns:
        ``output_path`` as a :class:`~pathlib.Path`.

    Raises:
        InputError: ``show_progress`` is not a ``bool``.
    """
    if not isinstance(show_progress, bool):
        raise InputError(
            "show_progress must be a bool",
            details={"type": type(show_progress).__name__},
        )
    # The flag is validated for API compatibility but not otherwise used here.
    del show_progress

    output = Path(output_path)
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open("w", encoding="utf-8") as handle:
        scored_records = _iter_vcf_scores(
            vcf_path,
            encoder,
            action_encoder,
            predictor,
            calibration,
            reference_windows=reference_windows,
            reference_fasta=reference_fasta,
            window_bp=window_bp,
            window_start_bp=window_start_bp,
            region=region,
            repeat=repeat,
            aggregation=aggregation,
            batch_size=batch_size,
            min_bucket_size=min_bucket_size,
        )
        for record in scored_records:
            variant = record.variant
            # Result fields are merged last, so they win on any key collision.
            row = {
                "schema_version": SCORE_JSONL_SCHEMA_VERSION,
                "generated_by": SCORE_JSONL_GENERATED_BY,
                "chrom": variant.chrom,
                "pos": variant.pos,
                "ref": variant.ref,
                "alt": variant.alt,
                **record.result.to_dict(),
            }
            line = json.dumps(row, sort_keys=True)
            handle.write(line + "\n")
    return output