Skip to content

geno_lewm.carbon_zero_shot

carbon_zero_shot

Carbon zero-shot baseline scoring artifacts for release evaluation.

CarbonZeroShotRecord dataclass

CarbonZeroShotRecord(chrom: str, pos: int, ref: str, alt: str, carbon_ref_log_likelihood: float, carbon_alt_log_likelihood: float, carbon_alt_minus_ref_log_likelihood: float, carbon_zero_shot_score: float, window_start_bp: int, window_bp: int, reference_window_sha256: str, alternate_window_sha256: str)

One Carbon zero-shot score row for geno-lewm-eval baseline input.

to_json_dict

to_json_dict() -> dict[str, str | int | float]

Return the JSONL row consumed by geno-lewm-eval.

Source code in geno_lewm/carbon_zero_shot.py
def to_json_dict(self) -> dict[str, str | int | float]:
    """Serialize this record into the JSONL row read by ``geno-lewm-eval``.

    The row starts with the schema/provenance constants, followed by the
    variant coordinates, the log-likelihood values, the score stored under
    ``CARBON_ZERO_SHOT_SCORE_FIELD``, and finally the window description.
    """
    row: dict[str, str | int | float] = {
        "schema_version": CARBON_ZERO_SHOT_SCHEMA_VERSION,
        "generated_by": CARBON_ZERO_SHOT_GENERATED_BY,
    }
    # Fields emitted under their own attribute name, ahead of the score.
    leading_fields = (
        "chrom",
        "pos",
        "ref",
        "alt",
        "carbon_ref_log_likelihood",
        "carbon_alt_log_likelihood",
        "carbon_alt_minus_ref_log_likelihood",
    )
    for field_name in leading_fields:
        row[field_name] = getattr(self, field_name)
    # The score key is configurable through a module constant, so it is
    # assigned separately from the attribute-named fields.
    row[CARBON_ZERO_SHOT_SCORE_FIELD] = self.carbon_zero_shot_score
    trailing_fields = (
        "window_start_bp",
        "window_bp",
        "reference_window_sha256",
        "alternate_window_sha256",
    )
    for field_name in trailing_fields:
        row[field_name] = getattr(self, field_name)
    return row

CarbonZeroShotSummary dataclass

CarbonZeroShotSummary(generated_by: str, generated_at: str, carbon_model: str, carbon_revision: str, vcf: str, fasta: str, output_scores: str, score_field: str, records: int, window_bp: int, logp_cache: str | None, logp_cache_entries: int, new_logp_evaluations: int, local_files_only: bool)

Machine-readable summary for a generated baseline score artifact.

to_json_dict

to_json_dict() -> dict[str, str | int | bool | None]

Return the JSON-native summary payload.

Source code in geno_lewm/carbon_zero_shot.py
def to_json_dict(self) -> dict[str, str | int | bool | None]:
    """Return the JSON-native summary payload."""
    return {
        "generated_by": self.generated_by,
        "generated_at": self.generated_at,
        "carbon_model": self.carbon_model,
        "carbon_revision": self.carbon_revision,
        "vcf": self.vcf,
        "fasta": self.fasta,
        "output_scores": self.output_scores,
        "score_field": self.score_field,
        "records": self.records,
        "window_bp": self.window_bp,
        "logp_cache": self.logp_cache,
        "logp_cache_entries": self.logp_cache_entries,
        "new_logp_evaluations": self.new_logp_evaluations,
        "local_files_only": self.local_files_only,
    }

CarbonLogLikelihoodScorer

CarbonLogLikelihoodScorer(model: object, tokenizer: object, *, torch: object, device: str | None = None)

Compute autoregressive Carbon log-likelihood for one DNA window.

Source code in geno_lewm/carbon_zero_shot.py
def __init__(
    self,
    model: object,
    tokenizer: object,
    *,
    torch: object,
    device: str | None = None,
) -> None:
    if not callable(tokenizer):
        raise InputError(
            "tokenizer must be callable",
            details={"type": type(tokenizer).__name__},
        )
    if not callable(model):
        raise InputError("model must be callable", details={"type": type(model).__name__})
    self.model = model
    self.tokenizer = tokenizer
    self.torch = torch
    self.device = device
    eval_method = getattr(model, "eval", None)
    if callable(eval_method):
        eval_method()
    if device is not None:
        to_method = getattr(model, "to", None)
        if callable(to_method):
            to_method(device)

__call__

__call__(sequence: str) -> float

Return summed next-token log-likelihood for a Carbon DNA window.

Source code in geno_lewm/carbon_zero_shot.py
def __call__(self, sequence: str) -> float:
    """Score one Carbon DNA window as its summed next-token log-likelihood.

    Raises ``InputError`` when the tokenizer output is not a mapping, lacks
    ``input_ids``, or has ``input_ids`` that are not shaped ``[batch, seq>=2]``.
    Raises ``RuntimeSetupError`` when the model output carries no logits.
    """
    encode = cast(Callable[..., Mapping[str, Any]], self.tokenizer)
    encoded = encode(wrap_dna_for_tokenizer(sequence), return_tensors="pt")
    if not isinstance(encoded, Mapping):
        raise InputError(
            "Carbon tokenizer must return a mapping",
            details={"type": type(encoded).__name__},
        )

    model_inputs = _move_mapping(encoded, self.device)
    token_ids = model_inputs.get("input_ids")
    if token_ids is None:
        raise InputError("Carbon tokenizer output must include input_ids")
    dims = getattr(token_ids, "shape", None)
    # Reject anything that is not a 2-D id tensor with at least two positions.
    if dims is None or len(dims) != 2 or dims[1] < 2:
        raise InputError("Carbon tokenizer input_ids must have shape [batch, seq>=2]")

    with _no_grad(self.torch):
        forward = cast(Callable[..., object], self.model)
        result = forward(**dict(model_inputs))

    logits = getattr(result, "logits", None)
    if logits is None:
        # Outputs without a ``logits`` attribute may still be a non-empty
        # tuple, in which case its first element is used as the logits.
        if isinstance(result, tuple) and result:
            logits = result[0]
    if logits is None:
        raise RuntimeSetupError(
            "Carbon model output does not expose logits",
            remediation="load an autoregressive Carbon language-model head",
        )

    return _autoregressive_log_likelihood(
        torch=self.torch,
        logits=logits,
        input_ids=token_ids,
        attention_mask=model_inputs.get("attention_mask"),
    )

load_carbon_logp_scorer

load_carbon_logp_scorer(model_dir: str | Path, *, revision: str = 'main', dtype: str = 'bf16', device: str | None = None, trust_remote_code: bool = False, local_files_only: bool = True) -> CarbonLogLikelihoodScorer

Load a local Carbon language-model scorer through Transformers.

Source code in geno_lewm/carbon_zero_shot.py
def load_carbon_logp_scorer(
    model_dir: str | Path,
    *,
    revision: str = "main",
    dtype: str = "bf16",
    device: str | None = None,
    trust_remote_code: bool = False,
    local_files_only: bool = True,
) -> CarbonLogLikelihoodScorer:
    """Build a ``CarbonLogLikelihoodScorer`` from a model loaded via Transformers.

    ``transformers`` and ``torch`` are imported lazily; a missing package, or a
    ``transformers`` module without ``AutoTokenizer``/``AutoModelForCausalLM``,
    is reported as ``RuntimeSetupError``. The tokenizer and model are both
    loaded with the same ``revision``, ``local_files_only`` and
    ``trust_remote_code`` settings; the model additionally receives
    ``torch_dtype`` when ``_torch_dtype`` resolves ``dtype`` to a value.
    """
    try:
        hf_module = importlib.import_module("transformers")
    except ImportError as exc:
        raise RuntimeSetupError(
            "Carbon zero-shot scoring requires Hugging Face Transformers",
            remediation="install geno-lewm[eval] plus transformers/torch, or run in train extra",
        ) from exc
    try:
        torch_module = importlib.import_module("torch")
    except ImportError as exc:
        raise RuntimeSetupError(
            "Carbon zero-shot scoring requires PyTorch",
            remediation="install geno-lewm[train] in the scoring environment",
        ) from exc

    auto_tokenizer = getattr(hf_module, "AutoTokenizer", None)
    auto_model = getattr(hf_module, "AutoModelForCausalLM", None)
    entry_point_missing = auto_tokenizer is None or auto_model is None
    if entry_point_missing:
        raise RuntimeSetupError("transformers must expose AutoTokenizer and AutoModelForCausalLM")

    source = str(Path(model_dir).expanduser())
    shared_options = dict(
        revision=revision,
        local_files_only=local_files_only,
        trust_remote_code=trust_remote_code,
    )
    # The tokenizer is loaded before the dtype name is resolved.
    loaded_tokenizer = auto_tokenizer.from_pretrained(source, **shared_options)

    model_options = {**shared_options}
    resolved_dtype = _torch_dtype(torch_module, dtype)
    if resolved_dtype is not None:
        model_options["torch_dtype"] = resolved_dtype
    loaded_model = auto_model.from_pretrained(source, **model_options)

    return CarbonLogLikelihoodScorer(
        loaded_model,
        loaded_tokenizer,
        torch=torch_module,
        device=device,
    )

write_carbon_zero_shot_scores

write_carbon_zero_shot_scores(*, vcf_path: str | Path, fasta_path: str | Path, output_scores: str | Path, scorer: Callable[[str], float], carbon_model: str, carbon_revision: str, window_bp: int = DEFAULT_WINDOW_BP, logp_cache_jsonl: str | Path | None = None, metadata_output: str | Path | None = None, generated_at: str | None = None, local_files_only: bool = True) -> CarbonZeroShotSummary

Write Carbon zero-shot baseline scores for all VCF alternate alleles.

Source code in geno_lewm/carbon_zero_shot.py
def write_carbon_zero_shot_scores(
    *,
    vcf_path: str | Path,
    fasta_path: str | Path,
    output_scores: str | Path,
    scorer: Callable[[str], float],
    carbon_model: str,
    carbon_revision: str,
    window_bp: int = DEFAULT_WINDOW_BP,
    logp_cache_jsonl: str | Path | None = None,
    metadata_output: str | Path | None = None,
    generated_at: str | None = None,
    local_files_only: bool = True,
) -> CarbonZeroShotSummary:
    """Write Carbon zero-shot baseline scores for all VCF alternate alleles.

    Score rows are staged in a sibling ``<name>.partial`` file and moved over
    ``output_scores`` only after every variant has been scored and at least
    one row exists. A failed or empty run therefore neither leaves a truncated
    score file at the destination nor clobbers a previously written one. If
    the destination is a symlink or an existing non-regular file, rows are
    written in place instead.

    Args:
        vcf_path: VCF whose variants are scored.
        fasta_path: FASTA from which the reference sequences are loaded.
        output_scores: Destination JSONL file; missing parent directories are
            created.
        scorer: Callable invoked on window strings to obtain log-likelihoods.
        carbon_model: Model identifier, validated with ``_required_text`` and
            recorded in the cache metadata and summary.
        carbon_revision: Model revision, validated and recorded the same way.
        window_bp: Window size forwarded to the per-variant scoring helper.
        logp_cache_jsonl: Optional log-likelihood cache; loaded before scoring
            and rewritten after a successful run.
        metadata_output: Optional path that receives the summary as JSON.
        generated_at: Timestamp to record; defaults to the current UTC time in
            ISO-8601 form with a ``Z`` suffix.
        local_files_only: Recorded in the summary; not otherwise used here.

    Returns:
        The ``CarbonZeroShotSummary`` describing the written artifact.

    Raises:
        InputError: If the VCF yields no score rows.
    """
    output = Path(output_scores)
    output.parent.mkdir(parents=True, exist_ok=True)
    cache_path = None if logp_cache_jsonl is None else Path(logp_cache_jsonl)
    carbon_model_text = _required_text("carbon_model", carbon_model)
    carbon_revision_text = _required_text("carbon_revision", carbon_revision)
    logp_cache = _load_logp_cache(
        cache_path,
        carbon_model=carbon_model_text,
        carbon_revision=carbon_revision_text,
    )
    initial_cache_keys = frozenset(logp_cache)
    reference_sequences = _load_reference_fasta(fasta_path)
    generated = generated_at or datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Stage only when the destination is absent or a plain regular file, so
    # symlinks and special files (devices, FIFOs) keep being written in place.
    use_staging = not output.is_symlink() and (output.is_file() or not output.exists())
    write_target = output.with_name(f"{output.name}.partial") if use_staging else output

    records = 0
    try:
        with write_target.open("w", encoding="utf-8") as handle:
            for variant in _iter_vcf_variants(vcf_path):
                record = _score_variant_window(
                    variant,
                    reference_sequences=reference_sequences,
                    window_bp=window_bp,
                    scorer=scorer,
                    logp_cache=logp_cache,
                )
                handle.write(json.dumps(record.to_json_dict(), sort_keys=True) + "\n")
                records += 1
        if records == 0:
            raise InputError("VCF produced no Carbon zero-shot baseline rows")
        if use_staging:
            # Publish the finished file in one step.
            write_target.replace(output)
    finally:
        if use_staging:
            # No-op after a successful replace; removes leftovers on failure.
            write_target.unlink(missing_ok=True)

    if cache_path is not None:
        _write_logp_cache(
            cache_path,
            logp_cache,
            carbon_model=carbon_model_text,
            carbon_revision=carbon_revision_text,
        )

    summary = CarbonZeroShotSummary(
        generated_by=CARBON_ZERO_SHOT_GENERATED_BY,
        generated_at=generated,
        carbon_model=carbon_model_text,
        carbon_revision=carbon_revision_text,
        vcf=str(vcf_path),
        fasta=str(fasta_path),
        output_scores=str(output),
        score_field=CARBON_ZERO_SHOT_SCORE_FIELD,
        records=records,
        window_bp=window_bp,
        logp_cache=None if cache_path is None else str(cache_path),
        logp_cache_entries=len(logp_cache),
        new_logp_evaluations=len(set(logp_cache) - initial_cache_keys),
        local_files_only=local_files_only,
    )
    if metadata_output is not None:
        metadata_path = Path(metadata_output)
        metadata_path.parent.mkdir(parents=True, exist_ok=True)
        metadata_path.write_text(
            json.dumps(summary.to_json_dict(), indent=2, sort_keys=True) + "\n",
            encoding="utf-8",
        )
    return summary