`geno_lewm.encoder`¶

encoder ¶

State-encoder input preparation and Carbon wrapper helpers.

The pure-Python windowing, pooling, and cache helpers import without the ML runtime. CarbonStateEncoder loads the optional Transformers stack only when callers construct it without injected model/tokenizer objects.

CacheLookupResult `dataclass` ¶

CacheLookupResult(embedding: tuple[float, ...], provenance: CacheProvenance)

One cached embedding together with its selected physical provenance.

CacheProvenance `dataclass` ¶

CacheProvenance(cache_schema_version: str, physical_encoding: str, shard_path: Path, row_offset: int)

Physical source selected for one logical cache-key lookup.

CacheReindexReport `dataclass` ¶

CacheReindexReport(indexed_shards: int, indexed_rows: int, index_path: Path)

Summary of a SQLite index rebuild.

CacheRepairReport `dataclass` ¶

CacheRepairReport(checked_shards: int, quarantined: tuple[Path, ...], reindex: CacheReindexReport)

Summary of a repair pass over Parquet shards.

CacheShardInspection `dataclass` ¶

CacheShardInspection(path: Path, records: tuple[WindowCacheRecord, ...], sha256: str, size_bytes: int)

Fully decoded immutable shard bytes and their exact file identity.

WindowCacheKey `dataclass` ¶

WindowCacheKey(window_hash: bytes, encoder_hash: bytes, state_layer: int, pool_type: str, pool_radius: int, center_token: int | None, dtype: str)

Content-addressed key for a cached embedding row.

WindowCacheRecord `dataclass` ¶

WindowCacheRecord(chrom: str, start_bp: int, end_bp: int, window_hash: bytes, encoder_hash: bytes, state_layer: int, pool_type: str, pool_radius: int, center_token: int | None, dtype: str, embedding: tuple[float, ...], untargeted: bool, created_at: int = 0, schema_version: str = CACHE_SCHEMA_VERSION)

One raw pooled row in the window-embedding cache schema.

Normalization is a consumer-side view and is intentionally absent from the cache key. Cache producers must never persist normalized states.

key `property` ¶

key: WindowCacheKey

Return the content-addressed key for this row.

with_created_at ¶

with_created_at() -> WindowCacheRecord

Fill created_at with current UTC nanoseconds when absent.

Source code in geno_lewm/encoder/cache.py

def with_created_at(self) -> WindowCacheRecord:
    """Return a record whose ``created_at`` is populated.

    A record that already carries a truthy ``created_at`` is returned as-is
    (the very same object). Otherwise a new ``WindowCacheRecord`` is built
    with every field copied and ``created_at`` taken from ``time.time_ns()``.
    """
    if not self.created_at:
        # Fields carried over unchanged from the current record.
        carried = {
            "chrom": self.chrom,
            "start_bp": self.start_bp,
            "end_bp": self.end_bp,
            "window_hash": self.window_hash,
            "encoder_hash": self.encoder_hash,
            "state_layer": self.state_layer,
            "pool_type": self.pool_type,
            "pool_radius": self.pool_radius,
            "center_token": self.center_token,
            "dtype": self.dtype,
            "embedding": self.embedding,
            "untargeted": self.untargeted,
        }
        return WindowCacheRecord(
            **carried,
            created_at=time.time_ns(),
            schema_version=self.schema_version,
        )
    return self

CacheBuildReport `dataclass` ¶

CacheBuildReport(report_path: Path, checksums_path: Path, payload: Mapping[str, object])

Completed evidence bundle and its JSON-native report payload.

to_dict ¶

to_dict() -> dict[str, object]

Return a detached JSON-native copy of the report.

Source code in geno_lewm/encoder/cache_build.py

def to_dict(self) -> dict[str, object]:
    """Return a copy of ``payload`` detached through a JSON round trip.

    Serializing and re-parsing yields fresh containers, so mutating the
    result never touches the report's own payload.
    """
    serialized = json.dumps(self.payload)
    detached = json.loads(serialized)
    return cast(dict[str, object], detached)

CarbonStateEncoder ¶

CarbonStateEncoder(model_id: str, revision: str, *, dtype: str = 'bf16', state_layer: int = -1, pool_type: str = POOL_CENTERED_MEAN, pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS, normalize: bool = True, lora_config: object | None = None, model: object | None = None, tokenizer: object | None = None, encoder_hash: bytes | str | None = None, local_files_only: bool = True, trust_remote_code: bool = False, device: str | None = None)

Encode DNA windows with Carbon hidden states plus deterministic pooling.

Source code in geno_lewm/encoder/carbon.py

def __init__(
    self,
    model_id: str,
    revision: str,
    *,
    dtype: str = "bf16",
    state_layer: int = -1,
    pool_type: str = POOL_CENTERED_MEAN,
    pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS,
    normalize: bool = True,
    lora_config: object | None = None,
    model: object | None = None,
    tokenizer: object | None = None,
    encoder_hash: bytes | str | None = None,
    local_files_only: bool = True,
    trust_remote_code: bool = False,
    device: str | None = None,
) -> None:
    """Validate the configuration, then load or adopt the model and tokenizer.

    All arguments are validated before anything is stored or loaded. When
    ``model`` and ``tokenizer`` are both omitted they are loaded through
    ``_load_transformers_components``; when both are supplied they are used
    as given. The model's parameters are then frozen, the model is switched
    to eval mode when it supports that, and it is moved to the resolved
    device.

    Raises:
        InputError: ``model_id`` or ``revision`` is empty; ``dtype`` or
            ``pool_type`` is unsupported; ``state_layer`` is not an integer;
            ``pool_radius`` is not a non-negative integer; global-mean
            pooling is combined with a non-zero radius; ``normalize`` is not
            a bool; or only one of ``model`` / ``tokenizer`` is supplied.
        RuntimeSetupError: ``lora_config`` is supplied (not supported yet).
    """
    if not model_id:
        raise InputError("model_id must be non-empty")
    if not revision:
        raise InputError("revision must be non-empty")
    if dtype not in _SUPPORTED_DTYPES:
        raise InputError(
            "unsupported encoder dtype",
            details={"dtype": dtype, "supported": sorted(_SUPPORTED_DTYPES)},
        )
    # bool is a subclass of int, so it is rejected explicitly.
    if not isinstance(state_layer, int) or isinstance(state_layer, bool):
        raise InputError(
            "state_layer must be an integer",
            details={"state_layer": state_layer, "type": type(state_layer).__name__},
        )
    if pool_type not in {POOL_CENTERED_MEAN, POOL_GLOBAL_MEAN}:
        raise InputError(
            "unsupported pool_type",
            details={
                "pool_type": pool_type,
                "supported": [POOL_CENTERED_MEAN, POOL_GLOBAL_MEAN],
            },
        )
    if not isinstance(pool_radius, int) or isinstance(pool_radius, bool) or pool_radius < 0:
        raise InputError(
            "pool_radius must be a non-negative integer",
            details={"pool_radius": pool_radius, "type": type(pool_radius).__name__},
        )
    # Global-mean pooling has no radius; only 0 is accepted for it.
    if pool_type == POOL_GLOBAL_MEAN and pool_radius != 0:
        raise InputError(
            "global_mean pooling requires pool_radius=0",
            details={"pool_type": pool_type, "pool_radius": pool_radius},
        )
    if not isinstance(normalize, bool):
        raise InputError(
            "normalize must be bool",
            details={"type": type(normalize).__name__},
        )
    if lora_config is not None:
        raise RuntimeSetupError(
            "Carbon LoRA adapters are not supported by CarbonStateEncoder yet",
            remediation="merge LoRA adapters before loading or track the Phase 2 adapter issue",
        )
    # Exactly one of model/tokenizer being supplied is an error.
    if (model is None) != (tokenizer is None):
        raise InputError(
            "model and tokenizer must be supplied together",
            details={"model": model is not None, "tokenizer": tokenizer is not None},
        )

    self.model_id = model_id
    self.revision = revision
    self.dtype = dtype
    self.state_layer = state_layer
    self.pool_type = pool_type
    self.pool_radius = pool_radius
    self.normalize = normalize
    self.local_files_only = local_files_only
    self.trust_remote_code = trust_remote_code
    self.device = _resolve_device(device)
    self._encoder_hash = _coerce_encoder_hash(encoder_hash)
    # Pooled state width; stays None until it can be read from the model
    # config below (encode_batch also records it after encoding).
    self._d_state: int | None = None

    # Nothing injected: load the tokenizer and model from the pinned revision.
    if model is None or tokenizer is None:
        if self._encoder_hash is not None:
            # A pinned hash is checked against local weights before loading.
            _verify_local_encoder_weights(model_id, expected_hash=self._encoder_hash)
        tokenizer, model = _load_transformers_components(
            model_id=model_id,
            revision=revision,
            dtype=dtype,
            local_files_only=local_files_only,
            trust_remote_code=trust_remote_code,
        )
    self.tokenizer = tokenizer
    self.model = model
    # Freezing returns the total and trainable parameter counts.
    self._parameter_count, self._trainable_parameter_count = _freeze_module_parameters(
        self.model
    )
    _eval_if_available(self.model)
    _move_module_to_device(self.model, self.device)
    # Adopt config.hidden_size as the state width only when it is a positive,
    # non-bool integer; injected models may not expose a config at all.
    config = getattr(self.model, "config", None)
    hidden_size = getattr(config, "hidden_size", None)
    if isinstance(hidden_size, int) and not isinstance(hidden_size, bool) and hidden_size > 0:
        self._d_state = hidden_size

encoder_hash `property` ¶

encoder_hash: bytes

Return the configured encoder hash bytes.

d_state `property` ¶

d_state: int

Return the pooled state width when known.

parameter_count `property` ¶

parameter_count: int

Return the number of parameters exposed by the encoder module.

trainable_parameter_count `property` ¶

trainable_parameter_count: int

Return zero after the frozen-encoder contract is enforced.

encode ¶

encode(window: str, edit_locus: int | None = None) -> tuple[float, ...]

Encode and pool one DNA window.

Source code in geno_lewm/encoder/carbon.py

def encode(self, window: str, edit_locus: int | None = None) -> tuple[float, ...]:
    """Encode one DNA window by running it as a single-item batch."""
    single_item_batch = self.encode_batch([window], [edit_locus])
    return single_item_batch[0]

encode_batch ¶

encode_batch(windows: Sequence[str], edit_loci: Sequence[int | None]) -> tuple[tuple[float, ...], ...]

Encode and pool a batch of DNA windows.

Source code in geno_lewm/encoder/carbon.py

def encode_batch(
    self,
    windows: Sequence[str],
    edit_loci: Sequence[int | None],
) -> tuple[tuple[float, ...], ...]:
    """Encode and pool a batch of DNA windows.

    Args:
        windows: Non-empty sequence of DNA strings (a bare ``str``/``bytes``
            is rejected).
        edit_loci: One entry per window, each an int or ``None``.

    Returns:
        One pooled vector per window, in input order. Vectors are passed
        through ``l2_normalize_state`` when ``self.normalize`` is true.

    Raises:
        InputError: the arguments are not sequences, differ in length, or
            are empty; or the model output does not line up with the
            tokenized input.
    """
    # str and bytes are Sequences too, so they are excluded explicitly.
    if not isinstance(windows, Sequence) or isinstance(windows, str | bytes):
        raise InputError(
            "windows must be a sequence of DNA strings",
            details={"type": type(windows).__name__},
        )
    if not isinstance(edit_loci, Sequence) or isinstance(edit_loci, str | bytes):
        raise InputError(
            "edit_loci must be a sequence of int or None values",
            details={"type": type(edit_loci).__name__},
        )
    if len(windows) != len(edit_loci):
        raise InputError(
            "windows and edit_loci must have the same length",
            details={"windows": len(windows), "edit_loci": len(edit_loci)},
        )
    if not windows:
        raise InputError("windows must contain at least one sequence")

    # Canonicalize first; layouts and center tokens are resolved against the
    # canonical sequences, while the tokenizer sees the wrapped form.
    normalized = tuple(canonicalize_dna(window) for window in windows)
    wrapped = [wrap_dna_for_tokenizer(window) for window in normalized]
    tokenized = _tokenize(self.tokenizer, wrapped)
    # Layouts are resolved before the inputs are moved to the device.
    layouts = _resolve_dna_token_layouts(
        self.tokenizer,
        tokenized,
        sequences=normalized,
    )
    tokenized = _move_inputs_to_device(tokenized, self.device)
    with torch_inference_context():
        output = _call_model(self.model, tokenized)
    rows_by_item = _hidden_rows_by_item(output, state_layer=self.state_layer)
    if len(rows_by_item) != len(windows):
        raise InputError(
            "encoder output batch size does not match input windows",
            details={"expected": len(windows), "observed": len(rows_by_item)},
        )

    pooled_rows: list[tuple[float, ...]] = []
    # strict=True turns any length mismatch between the four streams into an error.
    for rows, edit_locus, layout, sequence in zip(
        rows_by_item,
        edit_loci,
        layouts,
        normalized,
        strict=True,
    ):
        if len(rows) != layout.padded_token_count:
            raise InputError(
                "encoder hidden-state length does not match tokenized input",
                details={
                    "hidden_tokens": len(rows),
                    "tokenized_tokens": layout.padded_token_count,
                },
            )
        center_token = layout.center_token(edit_locus, sequence_bp=len(sequence))
        pooled_rows.append(
            pool_hidden_states(
                # Only the first active_token_count rows are pooled; the rest
                # of the padded length is dropped.
                rows[: layout.active_token_count],
                edit_locus=edit_locus,
                center_token=center_token,
                # Half-open [start, end) range of the DNA content tokens.
                content_token_bounds=(
                    layout.dna_content_start,
                    layout.dna_content_start + layout.dna_content_count,
                ),
                pool_type=self.pool_type,
                pool_radius=self.pool_radius,
            ).vector
        )
    pooled = tuple(pooled_rows)
    encoded = (
        tuple(
            l2_normalize_state(vector, item_index=index) for index, vector in enumerate(pooled)
        )
        if self.normalize
        else pooled
    )
    # Record the observed state width from the first encoded vector.
    if encoded:
        self._d_state = len(encoded[0])
    return encoded

pooling_identity ¶

pooling_identity(window: str, edit_locus: int | None) -> tuple[str, int, int | None]

Resolve the exact cache pooling identity from Carbon token IDs.

Source code in geno_lewm/encoder/carbon.py

def pooling_identity(
    self,
    window: str,
    edit_locus: int | None,
) -> tuple[str, int, int | None]:
    """Resolve the ``(pool_type, pool_radius, center_token)`` cache identity.

    The window is canonicalized, wrapped, and tokenized so the center token
    comes from the tokenizer's own layout. A missing edit locus or a
    global-mean encoder yields ``(POOL_GLOBAL_MEAN, 0, None)``.
    """
    canonical = canonicalize_dna(window)
    token_batch = _tokenize(self.tokenizer, [wrap_dna_for_tokenizer(canonical)])
    layouts = _resolve_dna_token_layouts(
        self.tokenizer,
        token_batch,
        sequences=(canonical,),
    )
    # Resolved up front on every call, including the global-mean cases below.
    resolved_center = layouts[0].center_token(edit_locus, sequence_bp=len(canonical))
    if edit_locus is None or self.pool_type == POOL_GLOBAL_MEAN:
        return POOL_GLOBAL_MEAN, 0, None
    return POOL_CENTERED_MEAN, self.pool_radius, resolved_center

PoolingResult `dataclass` ¶

PoolingResult(vector: tuple[float, ...], pool_type: Literal['centered_mean', 'global_mean'], pool_radius: int, untargeted: bool, center_token: int | None, token_count: int)

Pooled state vector plus cache-key metadata.

d_state `property` ¶

d_state: int

Return the pooled vector width.

as_cache_fields ¶

as_cache_fields() -> Mapping[str, object]

Return fields shared with the window-cache schema.

Source code in geno_lewm/encoder/pooling.py

def as_cache_fields(self) -> Mapping[str, object]:
    """Return the pooling fields this result shares with the cache schema.

    The mapping holds ``pool_type``, ``pool_radius``, and ``untargeted``.
    """
    shared: dict[str, object] = {}
    shared["pool_type"] = self.pool_type
    shared["pool_radius"] = self.pool_radius
    shared["untargeted"] = self.untargeted
    return shared

EncoderRuntimeIdentity `dataclass` ¶

EncoderRuntimeIdentity(model_id: str, revision: str, state_contract_version: str, runtime_hash: str, weights_hash: str | None = None, schema_version: str = ENCODER_RUNTIME_IDENTITY_SCHEMA_VERSION)

Pinned model and byte identities required for cache construction.

cache_identity_hash `property` ¶

cache_identity_hash: str

Return the identity committed into cache keys for this state contract.

to_dict ¶

to_dict() -> dict[str, object]

Return the canonical JSON-native contract payload.

Source code in geno_lewm/encoder/runtime_identity.py

def to_dict(self) -> dict[str, object]:
    """Return the identity as a JSON-native dict.

    ``weights_hash`` is included, as the final key, only when it is set.
    """
    required: dict[str, object] = {
        "schema_version": self.schema_version,
        "model_id": self.model_id,
        "revision": self.revision,
        "state_contract_version": self.state_contract_version,
        "runtime_hash": self.runtime_hash,
    }
    if self.weights_hash is None:
        return required
    return {**required, "weights_hash": self.weights_hash}

ExtractedWindow `dataclass` ¶

ExtractedWindow(sequence: str, start_bp: int, end_bp: int, window_bp: int, edit_locus: int | None = None, relative_edit_locus: int | None = None, pad_right_bp: int = 0)

A fixed-size DNA window plus its source-coordinate metadata.

start_bp and end_bp are 0-based half-open coordinates in the caller's source coordinate system. end_bp - start_bp always equals window_bp even when the sequence had to be right-padded past the available source bases; pad_right_bp records how many trailing A bases were introduced.

untargeted `property` ¶

untargeted: bool

Return true when the window was not centered on an edit.

sha256 `property` ¶

sha256: bytes

SHA-256 digest of the canonical window sequence.

as_tokenizer_input ¶

as_tokenizer_input() -> str

Return the Carbon tokenizer input string for this window.

Source code in geno_lewm/encoder/windowing.py

def as_tokenizer_input(self) -> str:
    """Return this window's sequence wrapped for the Carbon tokenizer."""
    dna = self.sequence
    return wrap_dna_for_tokenizer(dna)

default_cache_dir ¶

default_cache_dir() -> Path

Return $GENO_LEWM_CACHE or the documented local default.

Source code in geno_lewm/encoder/cache.py

def default_cache_dir() -> Path:
    """Return ``$GENO_LEWM_CACHE`` or the documented local default.

    The default ``.geno-lewm-cache`` is used when the variable is unset or
    set to an empty string. A leading ``~`` in either value is expanded.

    Returns:
        The cache directory path; it is not created or checked for existence.
    """
    configured = os.environ.get("GENO_LEWM_CACHE")
    # An empty value must not be honoured: Path("") collapses to ".", which
    # would silently make the current working directory the cache root.
    return Path(configured or ".geno-lewm-cache").expanduser()

inspect_cache_shard ¶

inspect_cache_shard(cache_dir: Path | str, shard_path: Path | str) -> CacheShardInspection

Decode and hash one shard through one no-follow file descriptor.

The caller may pass a cache-relative path or an absolute path inside the cache root. Every physical row and embedding is decoded and validated. The digest and size describe the same held inode, so resume checks do not rely on a path-following time-of-check/time-of-use sequence.

Source code in geno_lewm/encoder/cache.py

def inspect_cache_shard(
    cache_dir: Path | str,
    shard_path: Path | str,
) -> CacheShardInspection:
    """Decode and hash one shard through one no-follow file descriptor.

    The caller may pass a cache-relative path or an absolute path inside the
    cache root. Every physical row and embedding is decoded and validated.
    The digest and size describe the same held inode, so resume checks do not
    rely on a path-following time-of-check/time-of-use sequence.

    Args:
        cache_dir: Cache root directory.
        shard_path: Shard location, absolute or relative to ``cache_dir``.

    Returns:
        The shard path, its decoded records, its SHA-256 digest, and its size
        in bytes as observed before reading.

    Raises:
        CacheCorruptError: the shard or its parent directory is missing, or
            the file's device, inode, size, or mtime changed while it was
            being read.
    """
    root = Path(cache_dir).absolute()
    supplied = Path(shard_path)
    # Relative shard paths are interpreted against the cache root.
    path = supplied if supplied.is_absolute() else root / supplied
    _require_secure_cache_io()
    _assert_safe_namespace_path(root, path, final_kind="regular file")
    with _secure_parent_directory(root, path, create=False) as parent_fd:
        if parent_fd is None:
            raise CacheCorruptError(
                "cache shard parent disappeared during inspection",
                details={"shard_path": str(path)},
            )
        # The shard is opened by name relative to the held parent descriptor.
        descriptor = _open_regular_at(parent_fd, path.name, label="cache shard")
        if descriptor is None:
            raise CacheCorruptError(
                "cache shard disappeared during inspection",
                details={"shard_path": str(path)},
            )
        try:
            # Stat before and after hashing/decoding so a concurrent change
            # to the held file is detected below.
            before = os.fstat(descriptor)
            digest = _sha256_descriptor(descriptor)
            records = _read_records_from_descriptor(descriptor, path=path)
            after = os.fstat(descriptor)
            # Presumably these confirm that the path and name still refer to
            # the descriptors held here — confirm against the helpers.
            _verify_directory_binding(path.parent, parent_fd)
            _verify_regular_name_binding(
                parent_fd,
                path.name,
                descriptor,
                label="cache shard",
            )
        finally:
            # Always release the shard descriptor, including on error.
            os.close(descriptor)
    # Identity (device, inode), size, and mtime must all be unchanged.
    if (
        before.st_dev,
        before.st_ino,
        before.st_size,
        before.st_mtime_ns,
    ) != (
        after.st_dev,
        after.st_ino,
        after.st_size,
        after.st_mtime_ns,
    ):
        raise CacheCorruptError(
            "cache shard changed during inspection",
            details={"shard_path": str(path)},
        )
    return CacheShardInspection(
        path=path,
        records=records,
        sha256=digest,
        size_bytes=before.st_size,
    )

read_cache_entries ¶

read_cache_entries(cache_dir: Path | str, keys: Sequence[WindowCacheKey], *, policy: CacheReadPolicy = 'require_v3') -> tuple[CacheLookupResult | None, ...]

Return cache entries in request order under an explicit provenance policy.

Source code in geno_lewm/encoder/cache.py

def read_cache_entries(
    cache_dir: Path | str,
    keys: Sequence[WindowCacheKey],
    *,
    policy: CacheReadPolicy = "require_v3",
) -> tuple[CacheLookupResult | None, ...]:
    """Look up ``keys`` and return one result per key, in request order.

    Provenances are resolved under ``policy`` first; keys without a
    provenance stay ``None`` in the result. Rows are then fetched shard by
    shard and checked against both the requested key and the indexed schema
    version.

    Raises:
        CacheCorruptError: an indexed row offset is absent from its shard, or
            a shard row disagrees with the index on key or schema version.
    """
    provenances = resolve_cache_provenances(cache_dir, keys, policy=policy)
    if not keys:
        return ()
    cache_root = Path(cache_dir)

    # Keep only the keys the index could resolve.
    provenance_for = {}
    for key, provenance in zip(keys, provenances, strict=True):
        if provenance is not None:
            provenance_for[key] = provenance

    # Group (result slot, key, row offset) by shard so each shard is read once.
    pending: dict[Path, list[tuple[int, WindowCacheKey, int]]] = defaultdict(list)
    for slot, key in enumerate(keys):
        hit = provenance_for.get(key)
        if hit is None:
            continue
        pending[hit.shard_path].append((slot, key, hit.row_offset))

    found: list[CacheLookupResult | None] = [None] * len(keys)
    for shard, wanted in pending.items():
        offsets = {offset for _, _, offset in wanted}
        rows = _read_records_at_offsets(
            shard,
            offsets,
            cache_dir=cache_root,
        )
        unresolved = offsets - rows.keys()
        if unresolved:
            raise CacheCorruptError(
                "cache index row_offset could not be resolved in shard",
                details={
                    "shard_path": str(shard),
                    "row_offsets": sorted(unresolved),
                },
            )
        for slot, key, offset in wanted:
            row = rows[offset]
            if row.key != key:
                raise CacheCorruptError(
                    "cache index key does not match shard row",
                    details={"shard_path": str(shard), "row_offset": offset},
                )
            selected = provenance_for[key]
            if row.schema_version != selected.cache_schema_version:
                raise CacheCorruptError(
                    "cache index provenance does not match shard row",
                    details={"shard_path": str(shard), "row_offset": offset},
                )
            found[slot] = CacheLookupResult(
                embedding=row.embedding,
                provenance=selected,
            )
    return tuple(found)

read_cache_entry ¶

read_cache_entry(cache_dir: Path | str, key: WindowCacheKey, *, policy: CacheReadPolicy = 'require_v3') -> CacheLookupResult | None

Return one embedding and its selected cache provenance.

Source code in geno_lewm/encoder/cache.py

def read_cache_entry(
    cache_dir: Path | str,
    key: WindowCacheKey,
    *,
    policy: CacheReadPolicy = "require_v3",
) -> CacheLookupResult | None:
    """Look up a single key and return its embedding with provenance.

    Delegates to ``read_cache_entries`` with a one-element key tuple and
    returns that call's only result.
    """
    single_key_results = read_cache_entries(cache_dir, (key,), policy=policy)
    return single_key_results[0]

read_embedding ¶

read_embedding(cache_dir: Path | str, key: WindowCacheKey, *, policy: CacheReadPolicy = 'require_v3') -> tuple[float, ...] | None

Return a raw pooled embedding by content key, or None on cache miss.

Source code in geno_lewm/encoder/cache.py

def read_embedding(
    cache_dir: Path | str,
    key: WindowCacheKey,
    *,
    policy: CacheReadPolicy = "require_v3",
) -> tuple[float, ...] | None:
    """Return the raw pooled embedding stored for ``key``, or ``None`` on a miss."""
    entry = read_cache_entry(cache_dir, key, policy=policy)
    if entry is None:
        return None
    return entry.embedding

read_embeddings ¶

read_embeddings(cache_dir: Path | str, keys: Sequence[WindowCacheKey], *, policy: CacheReadPolicy = 'require_v3') -> tuple[tuple[float, ...] | None, ...]

Return raw embeddings for keys in order, grouping reads by shard.

Duplicate keys and misses are preserved in the returned tuple. Only the Parquet row groups containing requested rows are read.

Source code in geno_lewm/encoder/cache.py

def read_embeddings(
    cache_dir: Path | str,
    keys: Sequence[WindowCacheKey],
    *,
    policy: CacheReadPolicy = "require_v3",
) -> tuple[tuple[float, ...] | None, ...]:
    """Return raw embeddings for ``keys`` in order, grouping reads by shard.

    Duplicate keys and misses are preserved in the returned tuple. Only the
    Parquet row groups containing requested rows are read.
    """
    entries = read_cache_entries(cache_dir, keys, policy=policy)
    # Project each lookup onto its embedding; a miss stays None.
    return tuple(entry.embedding if entry is not None else None for entry in entries)

reindex_cache ¶

reindex_cache(cache_dir: Path | str) -> CacheReindexReport

Rebuild index.sqlite from every readable Parquet shard.

Source code in geno_lewm/encoder/cache.py

def reindex_cache(cache_dir: Path | str) -> CacheReindexReport:
    """Rebuild ``index.sqlite`` from every readable Parquet shard.

    The rebuild runs while the cache publication lock is held.
    """
    cache_root = Path(cache_dir)
    _require_secure_cache_io()
    with _cache_publication_lock(cache_root):
        rebuilt = _reindex_cache_locked(cache_root)
        return rebuilt

repair_cache ¶

repair_cache(cache_dir: Path | str) -> CacheRepairReport

Quarantine unreadable Parquet shards and rebuild the SQLite index.

Source code in geno_lewm/encoder/cache.py

def repair_cache(cache_dir: Path | str) -> CacheRepairReport:
    """Quarantine unreadable Parquet shards and rebuild the SQLite index.

    Every shard is read while the publication lock is held. A shard whose
    read raises ``CacheCorruptError`` is quarantined, and the index is rebuilt
    once all shards have been visited.
    """
    cache_root = Path(cache_dir)
    _require_secure_cache_io()
    moved_aside: list[Path] = []
    with _cache_publication_lock(cache_root):
        # Snapshot the shard listing before iterating; shards may be
        # quarantined while the loop runs.
        shards = list(_iter_shards(cache_root))
        for shard in shards:
            try:
                _read_records_from_cache_shard(cache_root, shard)
            except CacheCorruptError:
                moved_aside.append(_quarantine_shard(cache_root, shard))
        reindex_report = _reindex_cache_locked(cache_root)
    return CacheRepairReport(
        checked_shards=len(shards),
        quarantined=tuple(moved_aside),
        reindex=reindex_report,
    )

shard_path_for ¶

shard_path_for(cache_dir: Path | str, *, encoder_id: str, state_layer: int, pool_type: str, pool_radius: int, contig: str, stride_block: int, encoder_hash: bytes | None = None, dtype: str | None = None) -> Path

Return the canonical Parquet shard path for a cache block.

Supplying encoder_hash and dtype selects the collision-safe v3 namespace. Omitting both preserves the legacy v2 path contract for callers that need to locate existing artifacts.

Source code in geno_lewm/encoder/cache.py

def shard_path_for(
    cache_dir: Path | str,
    *,
    encoder_id: str,
    state_layer: int,
    pool_type: str,
    pool_radius: int,
    contig: str,
    stride_block: int,
    encoder_hash: bytes | None = None,
    dtype: str | None = None,
) -> Path:
    """Return the canonical Parquet shard path for a cache block.

    Passing both ``encoder_hash`` and ``dtype`` yields a path in the
    collision-safe v3 namespace. Passing neither yields the legacy v2 path,
    for callers that need to locate existing artifacts.

    Raises:
        InputError: ``contig`` is empty, ``stride_block`` is not a
            non-negative integer, or only one of ``encoder_hash`` / ``dtype``
            is supplied.
    """
    _validate_state_layer(state_layer)
    _validate_pool(pool_type, pool_radius)
    if not contig:
        raise InputError("contig must be non-empty")
    stride_ok = (
        isinstance(stride_block, int)
        and not isinstance(stride_block, bool)
        and stride_block >= 0
    )
    if not stride_ok:
        raise InputError(
            "stride_block must be a non-negative integer",
            details={"stride_block": stride_block},
        )
    cache_root = Path(cache_dir)
    if (encoder_hash is None) != (dtype is None):
        raise InputError("encoder_hash and dtype must be supplied together")

    pooling_segment = f"{pool_type}_{pool_radius}"
    if encoder_hash is None or dtype is None:
        # Legacy v2 layout.
        legacy_encoder = _legacy_path_part(encoder_id)
        legacy_contig = _legacy_path_part(contig)
        return cache_root.joinpath(
            _EMBEDDINGS_DIR,
            legacy_encoder,
            str(state_layer),
            pooling_segment,
            f"chr{legacy_contig}_{stride_block}.parquet",
        )

    # v3 layout: digests stand in for the encoder id and contig.
    encoder_segment = _digest_path_part("id", encoder_id)
    contig_segment = _digest_path_part("ctg", contig)
    hash_segment = _hash_path_part("encoder_hash", encoder_hash)
    _validate_dtype(dtype)
    return cache_root.joinpath(
        _EMBEDDINGS_DIR,
        "v3",
        encoder_segment,
        hash_segment,
        f"{dtype}_as_{_STORAGE_DTYPE}",
        str(state_layer),
        pooling_segment,
        f"{contig_segment}_{stride_block}.parquet",
    )

write_shard ¶

write_shard(cache_dir: Path | str, *, encoder_id: str, contig: str, stride_block: int, records: Sequence[WindowCacheRecord]) -> Path

Write one immutable Parquet shard and index its rows.

If the shard already exists with the same rows, this is a no-op. If it exists and new or conflicting rows are supplied, the function raises instead of rewriting in place (INV-DATA-3 / INV-DATA-10).

Source code in geno_lewm/encoder/cache.py

def write_shard(
    cache_dir: Path | str,
    *,
    encoder_id: str,
    contig: str,
    stride_block: int,
    records: Sequence[WindowCacheRecord],
) -> Path:
    """Write one immutable Parquet shard and index its rows.

    If the shard already exists with the same rows, this is a no-op.
    If it exists and new or conflicting rows are supplied, the function
    raises instead of rewriting in place (INV-DATA-3 / INV-DATA-10).

    Args:
        cache_dir: Cache root directory.
        encoder_id: Encoder identifier used to derive the shard path.
        contig: Contig every record must belong to (``record.chrom``).
        stride_block: Stride block used to derive the shard path.
        records: Non-empty rows sharing state layer, pooling, encoder hash,
            dtype, schema version, and embedding width, with unique keys.

    Returns:
        The path of the written shard, or of a recovered pending publication
        that matches the request.

    Raises:
        InputError: ``records`` is empty, does not use the current cache
            schema, mixes per-shard attributes, or repeats a cache key.
        CacheCorruptError: the cache index could not be reserved.
    """
    if not records:
        raise InputError("records must contain at least one cache row")
    requested = tuple(records)
    # Rows get a created_at timestamp (when absent) before storage conversion.
    normalized = tuple(_record_for_storage(record.with_created_at()) for record in requested)
    # The first row is the reference every other row is compared against.
    first = normalized[0]
    if first.schema_version != CACHE_SCHEMA_VERSION:
        raise InputError(
            "write_shard only writes the current cache schema",
            details={"schema_version": first.schema_version, "supported": CACHE_SCHEMA_VERSION},
        )
    if any(record.chrom != contig for record in normalized):
        raise InputError("all records in a shard must match the contig argument")
    if any(record.state_layer != first.state_layer for record in normalized):
        raise InputError("all records in a shard must share state_layer")
    if any(record.pool_type != first.pool_type for record in normalized):
        raise InputError("all records in a shard must share pool_type")
    if any(record.pool_radius != first.pool_radius for record in normalized):
        raise InputError("all records in a shard must share pool_radius")
    if any(record.encoder_hash != first.encoder_hash for record in normalized):
        raise InputError("all records in a shard must share encoder_hash")
    if any(record.dtype != first.dtype for record in normalized):
        raise InputError("all records in a shard must share dtype")
    if any(record.schema_version != first.schema_version for record in normalized):
        raise InputError("all records in a shard must share schema_version")
    if any(len(record.embedding) != len(first.embedding) for record in normalized):
        raise InputError("all records in a shard must share embedding width")
    keys = tuple(record.key for record in normalized)
    if len(set(keys)) != len(keys):
        raise InputError("records must not contain a duplicate cache key")

    root = Path(cache_dir)
    # encoder_hash and dtype are both passed, which selects the v3 namespace.
    path = shard_path_for(
        root,
        encoder_id=encoder_id,
        state_layer=first.state_layer,
        pool_type=first.pool_type,
        pool_radius=first.pool_radius,
        contig=contig,
        stride_block=stride_block,
        encoder_hash=first.encoder_hash,
        dtype=first.dtype,
    )
    _require_secure_cache_io()
    with _cache_publication_lock(root):
        # A pending publication is recovered first; when it matches this
        # request its path is returned without writing again.
        recovered = _recover_pending_publication(root)
        if recovered is not None and _recovered_publication_matches(
            recovered,
            requested=requested,
            normalized=normalized,
        ):
            return recovered.path
        with _open_direct_index_database(root, create=True, write=True) as conn:
            if conn is None:  # create=True guarantees a database or raises.
                raise CacheCorruptError("cache index could not be reserved")
            written = _write_shard_locked(
                root=root,
                path=path,
                requested=requested,
                normalized=normalized,
                index=conn,
            )
        # The pending-publication marker is cleared after the index
        # connection's context has exited.
        _clear_pending_publication(root)
        return written

build_window_cache ¶

build_window_cache(*, requests_jsonl: Path | str | bytes, cache_dir: Path | str, evidence_dir: Path | str, encoder: object, encoder_id: str, batch_size: int, rows_per_shard: int, created_at_ns: int, hardware: str, resolved_config: Mapping[str, object], encoder_runtime_identity: Mapping[str, object], input_artifacts: Mapping[str, Path | str | bytes] | None = None, logger: GenoLeWMLogger | None = None) -> CacheBuildReport

Build or resume the cache for one exact request JSONL artifact.

encoder must expose the raw-state CarbonStateEncoder contract: pooling_identity, encode_batch, an exact 32-byte encoder_hash, and normalize is False. Existing planned shards are decoded, hashed, compared row-for-row with the plan, and re-indexed before any missing shard is encoded.

Source code in geno_lewm/encoder/cache_build.py

def build_window_cache(
    *,
    requests_jsonl: Path | str | bytes,
    cache_dir: Path | str,
    evidence_dir: Path | str,
    encoder: object,
    encoder_id: str,
    batch_size: int,
    rows_per_shard: int,
    created_at_ns: int,
    hardware: str,
    resolved_config: Mapping[str, object],
    encoder_runtime_identity: Mapping[str, object],
    input_artifacts: Mapping[str, Path | str | bytes] | None = None,
    logger: GenoLeWMLogger | None = None,
) -> CacheBuildReport:
    """Build or resume the cache for one exact request JSONL artifact.

    ``encoder`` must expose the raw-state ``CarbonStateEncoder`` contract:
    ``pooling_identity``, ``encode_batch``, an exact 32-byte ``encoder_hash``,
    and ``normalize is False``. Existing planned shards are decoded, hashed,
    compared row-for-row with the plan, and re-indexed before any missing shard
    is encoded.

    ``requests_jsonl`` is used verbatim when it is ``bytes``; any other value
    is treated as a path and read from disk. ``created_at_ns`` equal to zero
    and an ``encoder_id`` that is not a non-empty ``str`` (exact type) are
    rejected with ``InputError``. ``logger`` is optional; when it is ``None``
    no log events are emitted and the recorded ``run_id`` is ``None``.

    If the evidence directory already contains the checksum file, no encoding
    is attempted and the result of verifying the completed bundle is returned.
    Otherwise rows that do not yet resolve from the cache are encoded shard by
    shard, the state file is rewritten after every published shard, and the
    report and checksum files are written last.

    Returns a ``CacheBuildReport`` carrying the report path, the checksum path
    and the report payload.

    Raises ``CacheCorruptError`` when recorded state names a shard that is
    missing on disk, when a published shard's embeddings differ from the rows
    just encoded, when a concurrent winner's rows cannot be resolved, or when
    the row-provenance accounting turns negative. Helpers called from here may
    raise further errors.
    """
    # Wall-clock origin for the invocation elapsed time reported at the end.
    started = time.perf_counter()
    batch_size = _positive_int("batch_size", batch_size)
    rows_per_shard = _positive_int("rows_per_shard", rows_per_shard)
    created_at_ns = _non_negative_int("created_at_ns", created_at_ns)
    # Zero is rejected explicitly: the caller has to pin the timestamp.
    if created_at_ns == 0:
        raise InputError("created_at_ns must be fixed to a positive UTC nanosecond value")
    # Exact-type check: str subclasses are rejected as well as empty text.
    if type(encoder_id) is not str or not encoder_id:
        raise InputError("encoder_id must be non-empty text")
    hardware = _text(hardware, field="hardware")
    resolved_config_payload = _json_object_copy(resolved_config, field="resolved_config")
    resolved_config_bytes = _pretty_json_bytes(resolved_config_payload)

    contract = _encoder_contract(encoder)
    # A missing ``device`` attribute is passed to ``_text`` as None.
    encoder_device = _text(getattr(encoder, "device", None), field="encoder.device")
    runtime_identity_payload = _encoder_runtime_identity_payload(
        encoder_runtime_identity,
        contract=contract,
        encoder_id=encoder_id,
        resolved_config=resolved_config_payload,
    )
    runtime_identity_bytes = _pretty_json_bytes(runtime_identity_payload)
    # Bytes are the literal request artifact; anything else is read as a path.
    request_bytes = (
        bytes(requests_jsonl)
        if isinstance(requests_jsonl, bytes)
        else _read_regular_bytes(Path(requests_jsonl), label="cache build requests")
    )
    requests = _parse_requests(request_bytes)
    # Identity of the exact request bytes: digest plus length.
    request_identity = {
        "sha256": sha256_bytes(request_bytes),
        "size_bytes": len(request_bytes),
    }

    # Path.absolute() anchors relative paths at the working directory without
    # resolving symlinks.
    cache_root = Path(cache_dir).absolute()
    evidence_root = Path(evidence_dir).absolute()
    evidence = _EvidenceStore(evidence_root)
    # None and an empty mapping are treated alike.
    staged_inputs = _read_input_artifacts(input_artifacts or {})
    input_identities = tuple(item.identity for item in staged_inputs)
    report_path = evidence_root / CACHE_BUILD_REPORT_NAME
    checksums_path = evidence_root / _CHECKSUMS_NAME
    # Identities (name, digest, length) of the serialized config and runtime
    # identity documents; these feed both the plan and the final report.
    resolved_config_identity = {
        "path": _RESOLVED_CONFIG_NAME,
        "sha256": sha256_bytes(resolved_config_bytes),
        "size_bytes": len(resolved_config_bytes),
    }
    runtime_identity = {
        "path": _RUNTIME_IDENTITY_NAME,
        "sha256": sha256_bytes(runtime_identity_bytes),
        "size_bytes": len(runtime_identity_bytes),
    }
    expected_evidence_names = _expected_evidence_names(input_identities)
    # Early inventory check with require_complete=False; the complete check
    # runs only after the checksum file has been written.
    _assert_evidence_inventory(
        evidence,
        expected_names=expected_evidence_names,
        require_complete=False,
    )

    # Plan derived purely from this invocation's arguments.
    expected_plan = _create_plan(
        requests=requests,
        request_identity=request_identity,
        cache_root=cache_root,
        encoder=encoder,
        encoder_id=encoder_id,
        contract=contract,
        batch_size=batch_size,
        rows_per_shard=rows_per_shard,
        created_at_ns=created_at_ns,
        hardware=hardware,
        encoder_device=encoder_device,
        resolved_config=resolved_config_identity,
        encoder_runtime_identity=runtime_identity,
        input_artifacts=input_identities,
    )
    # A stored plan is loaded against the expected one (presumably rejecting
    # mismatches; confirm in _load_plan). Otherwise the expected plan is
    # installed as the stored plan.
    if evidence.exists(_PLAN_NAME):
        plan_payload = _read_json_object(evidence, _PLAN_NAME, label="cache build plan")
        plan = _load_plan(
            plan_payload,
            expected=expected_plan,
        )
    else:
        plan = expected_plan
        _write_once(
            evidence, _PLAN_NAME, _pretty_json_bytes(plan.payload), label="cache build plan"
        )
    # The immutable plan is validated or installed before any caller-provided
    # artifact is staged. A failed invocation therefore cannot seed files that
    # a later, differently configured invocation would accidentally close.
    _write_once(evidence, _REQUEST_COPY_NAME, request_bytes, label="cache build request copy")
    _write_once(
        evidence,
        _RESOLVED_CONFIG_NAME,
        resolved_config_bytes,
        label="cache build resolved config",
    )
    _write_once(
        evidence,
        _RUNTIME_IDENTITY_NAME,
        runtime_identity_bytes,
        label="cache build encoder runtime identity",
    )
    _stage_input_artifacts(evidence, staged_inputs)
    # The digest of the stored plan ties the state file to this plan.
    plan_sha256 = evidence.capture(_PLAN_NAME, label="cache build plan").sha256

    state = _load_or_initialize_state(evidence, plan_sha256=plan_sha256)
    # Completed-shard entries keyed by shard id, mutated below as shards are
    # adopted or encoded.
    completed = _completed_by_id(state, plan=plan)
    adopted_or_changed = False

    # Resume preflight: verify every evidence-owned shard and repair its index
    # rows before resolving shared-cache hits or calling encode_batch.
    for shard in plan.shards:
        absolute_path = cache_root / shard.relative_path
        prior = completed.get(shard.shard_id)
        # Neither recorded nor present on disk: left for the encode loop.
        if prior is None and not os.path.lexists(absolute_path):
            continue
        # Recorded as completed but nothing at the path: unrecoverable here.
        if prior is not None and not os.path.lexists(absolute_path):
            raise CacheCorruptError(
                "cache build state names a completed shard that is missing",
                details={"shard_id": shard.shard_id, "path": shard.relative_path},
            )
        # Something exists at the planned path (lexists also reports dangling
        # symlinks): decode it and check it against the plan.
        inspection = inspect_cache_shard(cache_root, shard.relative_path)
        execution_shard = _execution_shard_from_records(shard, inspection.records)
        _assert_inspection_matches_plan(
            inspection,
            shard=execution_shard,
            created_at_ns=created_at_ns,
        )
        if prior is not None:
            _assert_state_identity(
                prior,
                inspection,
                shard=execution_shard,
                plan_shard=shard,
            )
        else:
            # Present on disk but unrecorded: adopt it with no encode work
            # attributed to this build.
            completed[shard.shard_id] = _completed_entry(
                execution_shard,
                inspection,
                plan_shard_id=shard.shard_id,
                origin="adopted_after_interruption",
                encoded_rows=0,
                encode_batch_calls=0,
                encode_batch_seconds=None,
            )
            adopted_or_changed = True
        # Re-publishing observed rows is a no-op on the immutable shard and
        # makes a missing/stale SQLite index converge without model work.
        write_shard(
            cache_root,
            encoder_id=plan.namespace,
            contig=shard.contig,
            stride_block=shard.stride_block,
            records=inspection.records,
        )

    # Persist state when shards were adopted or no state file exists yet.
    if adopted_or_changed or not evidence.exists(_STATE_NAME):
        state = _state_payload(plan_sha256=plan_sha256, completed=completed)
        _atomic_write(evidence, _STATE_NAME, _pretty_json_bytes(state))

    # Rows that already resolve from the cache before any encoding. Those whose
    # key appears among the completed entries' row keys count as resumed.
    resolved_before = _resolve_plan_cache(cache_root, plan, require_all=False)
    evidence_owned_keys = _completed_row_keys(completed)
    resumed_rows = sum(key in evidence_owned_keys for key in resolved_before.rows)
    # An existing checksum file marks a closed bundle: verify it and return
    # without entering the encode loop.
    if evidence.exists(_CHECKSUMS_NAME):
        return _verify_completed_bundle(
            evidence=evidence,
            evidence_root=evidence_root,
            cache_root=cache_root,
            plan=plan,
            plan_sha256=plan_sha256,
            expected_evidence_names=expected_evidence_names,
        )

    if logger is not None:
        logger.info(
            "data.cache.build.start",
            request_count=len(requests),
            unique_rows=sum(len(shard.rows) for shard in plan.shards),
            planned_shards=len(plan.shards),
            requests_sha256=request_identity["sha256"],
        )

    # Counters for work performed by this invocation only.
    encoded_rows = 0
    encoded_shards = 0
    for shard in plan.shards:
        # Only rows unresolved at preflight time are encoded; resolved_before
        # is not refreshed inside this loop.
        missing_rows = tuple(row for row in shard.rows if row.key not in resolved_before.rows)
        if not missing_rows:
            _emit_progress(
                logger,
                shard=shard,
                completed_shards=_resolved_plan_shard_count(plan, resolved_before.rows),
                total_shards=len(plan.shards),
                encoded_rows=encoded_rows,
                resumed_rows=resumed_rows,
                throughput_per_s=None,
                status="resumed",
            )
            continue
        execution_shard = _execution_shard(shard, rows=missing_rows)
        encoded = _encode_shard(
            execution_shard,
            encoder=encoder,
            contract=contract,
            batch_size=batch_size,
            created_at_ns=created_at_ns,
        )
        try:
            path = write_shard(
                cache_root,
                encoder_id=plan.namespace,
                contig=execution_shard.contig,
                stride_block=execution_shard.stride_block,
                records=encoded.records,
            )
        except CacheKeyAlreadyIndexedError as race_exc:
            # A concurrent builder may have published the same logical misses
            # after our preflight but before the serialized write reservation.
            # Accept only the precise key-reservation race. Any path that now
            # exists in this evidence namespace must already be owned by valid
            # durable state; other cache-corruption failures remain fatal.
            _assert_race_planned_path_is_evidence_owned(
                cache_root,
                execution_shard,
                plan_shard=shard,
                completed=completed,
                created_at_ns=created_at_ns,
            )
            _assert_encoded_rows_match_cache(cache_root, encoded.records)
            verified_after_race = _resolve_plan_cache(cache_root, plan, require_all=False)
            if any(record.key not in verified_after_race.rows for record in encoded.records):
                raise CacheCorruptError(
                    "concurrent cache winner disappeared during immediate verification"
                ) from race_exc
            # Race lost: nothing is added to ``completed`` for this shard, the
            # encode counters stay unchanged, and no progress event is emitted.
            continue
        # Re-read the shard at the planned path and check it against the plan
        # and against the rows that were just encoded.
        inspection = inspect_cache_shard(cache_root, shard.relative_path)
        _assert_inspection_matches_plan(
            inspection,
            shard=execution_shard,
            created_at_ns=created_at_ns,
        )
        # NOTE(review): _fp32_vector presumably rounds the encoded values to the
        # stored float32 precision so the comparison is exact; confirm.
        if tuple(record.embedding for record in inspection.records) != tuple(
            _fp32_vector(record.embedding) for record in encoded.records
        ):
            raise CacheCorruptError(
                "published cache shard embeddings do not match encoded rows",
                details={"shard_id": shard.shard_id, "path": shard.relative_path},
            )
        completed[shard.shard_id] = _completed_entry(
            execution_shard,
            inspection,
            plan_shard_id=shard.shard_id,
            origin="encoded",
            encoded_rows=len(execution_shard.rows),
            encode_batch_calls=encoded.encode_batch_calls,
            encode_batch_seconds=encoded.encode_batch_seconds,
        )
        encoded_rows += len(execution_shard.rows)
        encoded_shards += 1
        # State is rewritten after every encoded shard so an interruption
        # loses at most the shard in flight.
        state = _state_payload(plan_sha256=plan_sha256, completed=completed)
        _atomic_write(evidence, _STATE_NAME, _pretty_json_bytes(state))
        if logger is not None:
            logger.info(
                "data.shard.write",
                shard_id=shard.shard_id,
                path=path.relative_to(cache_root).as_posix(),
                n_rows=len(execution_shard.rows),
                size_bytes=inspection.size_bytes,
            )
        # No rate is reported when the measured encode time is not positive.
        rate = (
            None
            if encoded.encode_batch_seconds <= 0
            else len(execution_shard.rows) / encoded.encode_batch_seconds
        )
        _emit_progress(
            logger,
            shard=shard,
            completed_shards=_resolved_plan_shard_count(plan, resolved_before.rows)
            + encoded_shards,
            total_shards=len(plan.shards),
            encoded_rows=encoded_rows,
            resumed_rows=resumed_rows,
            throughput_per_s=rate,
            status="encoded",
        )

    # Final resolution with require_all=True (presumably fails if any planned
    # row is still unresolved; confirm in _resolve_plan_cache).
    resolved = _resolve_plan_cache(cache_root, plan, require_all=True)
    # Rows that were neither resumed from this evidence state nor encoded by
    # this invocation are counted as reused.
    reused_rows = len(resolved.rows) - resumed_rows - encoded_rows
    if reused_rows < 0:
        raise CacheCorruptError("cache build row provenance accounting is inconsistent")
    # Clamped at zero and rounded to microseconds.
    elapsed_seconds = round(max(time.perf_counter() - started, 0.0), 6)
    completion = _completion_payload(
        encoded_rows=encoded_rows,
        encoded_shards=encoded_shards,
        resumed_rows=resumed_rows,
        reused_rows=reused_rows,
        resolved_unique_rows=len(resolved.rows),
        planned_shards=len(plan.shards),
        invocation_elapsed_seconds=elapsed_seconds,
        run_id=None if logger is None else logger.run_id,
    )
    # Final state write, now including the completion summary.
    state = _state_payload(
        plan_sha256=plan_sha256,
        completed=completed,
        completion=completion,
    )
    _atomic_write(evidence, _STATE_NAME, _pretty_json_bytes(state))
    report_payload = _report_payload(
        evidence=evidence,
        plan=plan,
        request_identity=request_identity,
        request_count=len(requests),
        cache_root=cache_root,
        resolved=resolved,
        completed=completed,
        encoded_rows=encoded_rows,
        encoded_shards=encoded_shards,
        resumed_rows=resumed_rows,
        reused_rows=reused_rows,
        elapsed_seconds=elapsed_seconds,
        batch_size=batch_size,
        rows_per_shard=rows_per_shard,
        created_at_ns=created_at_ns,
        hardware=hardware,
        encoder_device=encoder_device,
        resolved_config=resolved_config_identity,
        encoder_runtime_identity=runtime_identity,
        run_id=None if logger is None else logger.run_id,
        input_artifacts=input_identities,
    )
    if logger is not None:
        # The end event reuses the throughput figures stored in the report.
        throughput = cast(Mapping[str, object], report_payload["throughput"])
        logger.info(
            "data.cache.build.end",
            completed_shards=len(plan.shards),
            encoded_rows=encoded_rows,
            resumed_rows=resumed_rows,
            elapsed_s=cast(float, throughput["invocation_elapsed_seconds"]),
            throughput_per_s=throughput["measured_encoded_rows_per_second"],
            evidence_report=report_path.name,
        )
    # Logging is complete before the checksum closure. No builder-owned write
    # occurs after SHA256SUMS is installed and verified.
    _atomic_write(evidence, CACHE_BUILD_REPORT_NAME, _pretty_json_bytes(report_payload))
    _write_checksums(evidence, expected_names=expected_evidence_names)
    _assert_evidence_inventory(
        evidence,
        expected_names=expected_evidence_names,
        require_complete=True,
    )
    _verify_checksums(evidence, expected_names=expected_evidence_names)
    return CacheBuildReport(
        report_path=report_path,
        checksums_path=checksums_path,
        payload=report_payload,
    )

centered_mean ¶

centered_mean(hidden_states: Sequence[Sequence[float]], *, center_token: int, pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS) -> tuple[float, ...]

Mean-pool the inclusive token span center_token ± pool_radius.

Source code in geno_lewm/encoder/pooling.py

def centered_mean(
    hidden_states: Sequence[Sequence[float]],
    *,
    center_token: int,
    pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS,
) -> tuple[float, ...]:
    """Average the token vectors within ``pool_radius`` of ``center_token``.

    The span is inclusive on both sides and is clipped to the available
    tokens, so a center close to either edge pools fewer than
    ``2 * pool_radius + 1`` vectors.
    """
    token_vectors = _coerce_hidden_states(hidden_states)
    token_count = len(token_vectors)
    midpoint = _validate_center_token(center_token, token_count)
    half_width = _validate_pool_radius(pool_radius)

    # Clip the inclusive span [midpoint - half_width, midpoint + half_width]
    # to the valid index range; ``upper`` is exclusive.
    lower = midpoint - half_width
    if lower < 0:
        lower = 0
    upper = midpoint + half_width + 1
    if upper > token_count:
        upper = token_count
    return _mean_rows(token_vectors[lower:upper])

global_mean ¶

global_mean(hidden_states: Sequence[Sequence[float]]) -> tuple[float, ...]

Mean-pool every token vector in hidden_states.

Source code in geno_lewm/encoder/pooling.py

def global_mean(hidden_states: Sequence[Sequence[float]]) -> tuple[float, ...]:
    """Average all token vectors of ``hidden_states`` into a single vector."""
    return _mean_rows(_coerce_hidden_states(hidden_states))

pool_hidden_states ¶

pool_hidden_states(hidden_states: Sequence[Sequence[float]], *, edit_locus: int | None = None, center_token: int | None = None, content_token_bounds: tuple[int, int] | None = None, pool_type: Literal['centered_mean', 'global_mean'] = POOL_CENTERED_MEAN, pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS) -> PoolingResult

Pool token-level hidden states into a state vector.

`center_token` is the actual hidden-state index resolved from the tokenizer's DNA/control-token layout. `edit_locus` only records whether the state is targeted; this function deliberately does not approximate a token index from base-pair arithmetic. When the locus is absent, the encoder contract requires a global-mean fallback tagged as untargeted.

Source code in geno_lewm/encoder/pooling.py

def pool_hidden_states(
    hidden_states: Sequence[Sequence[float]],
    *,
    edit_locus: int | None = None,
    center_token: int | None = None,
    content_token_bounds: tuple[int, int] | None = None,
    pool_type: Literal["centered_mean", "global_mean"] = POOL_CENTERED_MEAN,
    pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS,
) -> PoolingResult:
    """Reduce per-token hidden states to one state vector.

    Three outcomes are possible:

    * ``edit_locus`` absent: every token is averaged and the result is tagged
      ``untargeted`` with ``global_mean`` pooling, whatever ``pool_type`` was
      requested. ``center_token`` must be absent too.
    * ``edit_locus`` present with ``global_mean``: every token is averaged and
      the result is tagged as targeted. ``center_token`` must be absent.
    * ``edit_locus`` present with ``centered_mean``: the inclusive span
      ``center_token ± pool_radius`` is averaged after clipping it to
      ``content_token_bounds`` (all tokens when no bounds are given).

    ``center_token`` is a real hidden-state index supplied by the caller from
    the tokenizer layout; no token index is derived from ``edit_locus`` here.
    Invalid combinations raise ``InputError``.
    """
    token_vectors = _coerce_hidden_states(hidden_states)
    token_count = len(token_vectors)
    mode = _validate_pool_type(pool_type)
    half_width = _validate_pool_radius(pool_radius)
    wants_global = mode == POOL_GLOBAL_MEAN
    if wants_global and half_width != 0:
        raise InputError(
            "global_mean pooling requires pool_radius=0",
            details={"pool_radius": half_width},
        )

    def _whole_sequence(*, untargeted: bool) -> PoolingResult:
        # Shared result shape for both branches that average every token.
        return PoolingResult(
            vector=_mean_rows(token_vectors),
            pool_type=POOL_GLOBAL_MEAN,
            pool_radius=0,
            untargeted=untargeted,
            center_token=None,
            token_count=token_count,
        )

    if edit_locus is None:
        # Untargeted fallback: a center token makes no sense without a locus.
        if center_token is not None:
            raise InputError(
                "center_token must be absent when edit_locus is absent",
                details={"center_token": center_token},
            )
        # Bounds are still validated when supplied, even though they do not
        # restrict the averaged rows in this branch.
        if content_token_bounds is not None:
            _validate_content_token_bounds(content_token_bounds, token_count=token_count)
        return _whole_sequence(untargeted=True)

    _validate_edit_locus(edit_locus)

    if wants_global:
        if center_token is not None:
            raise InputError(
                "center_token must be absent for global_mean pooling",
                details={"center_token": center_token},
            )
        return _whole_sequence(untargeted=False)

    if center_token is None:
        raise InputError(
            "centered_mean pooling requires a tokenizer-resolved center_token",
            remediation="derive the center from the tokenized <dna>...</dna> layout",
        )
    midpoint = _validate_center_token(center_token, token_count)
    bounds = _validate_content_token_bounds(
        content_token_bounds or (0, token_count),
        token_count=token_count,
    )
    content_start, content_end = bounds
    if not (content_start <= midpoint < content_end):
        raise InputError(
            "center_token falls outside the DNA content-token bounds",
            details={
                "center_token": midpoint,
                "content_start": content_start,
                "content_end": content_end,
            },
        )
    # Clip the pooled span to the content tokens; ``upper`` is exclusive.
    lower = max(content_start, midpoint - half_width)
    upper = min(content_end, midpoint + half_width + 1)
    return PoolingResult(
        vector=_mean_rows(token_vectors[lower:upper]),
        pool_type=POOL_CENTERED_MEAN,
        pool_radius=half_width,
        untargeted=False,
        center_token=midpoint,
        token_count=token_count,
    )

parse_encoder_runtime_identity_bytes ¶

parse_encoder_runtime_identity_bytes(body: bytes, *, source: str) -> EncoderRuntimeIdentity

Parse one duplicate-key-free, closed runtime identity JSON object.

Source code in geno_lewm/encoder/runtime_identity.py

def parse_encoder_runtime_identity_bytes(
    body: bytes,
    *,
    source: str,
) -> EncoderRuntimeIdentity:
    """Parse one duplicate-key-free, closed runtime identity JSON object.

    ``body`` is decoded by ``json.loads`` with a pairs hook that rejects
    duplicate keys. The top-level value must be a JSON object whose key set
    contains every required key and nothing outside the required and optional
    sets. ``source`` is a caller-supplied label that is attached to every
    error so failures can be traced back to the offending artifact.

    Raises ``InputError`` when the bytes cannot be decoded or parsed, when the
    document is nested too deeply for the decoder, when a key is duplicated,
    when the top-level value is not an object, or when the key set violates
    the closed schema.
    """
    try:
        payload = json.loads(body, object_pairs_hook=_reject_duplicate_keys)
    except (
        UnicodeDecodeError,
        json.JSONDecodeError,
        # Pathologically nested input exhausts the decoder's recursion budget;
        # report it as bad input instead of leaking an interpreter error.
        RecursionError,
        InputError,
    ) as exc:
        raise InputError(
            "encoder runtime identity is invalid JSON",
            details={"source": source, "error": str(exc)},
        ) from exc
    # Exact-type check: only a plain JSON object is acceptable at top level.
    if type(payload) is not dict:
        raise InputError(
            "encoder runtime identity must contain one JSON object",
            details={"source": source, "type": type(payload).__name__},
        )
    observed = frozenset(payload)
    # Closed schema: all required keys present, nothing beyond the known keys.
    if not _REQUIRED_KEYS.issubset(observed) or not observed.issubset(
        _REQUIRED_KEYS | _OPTIONAL_KEYS
    ):
        raise InputError(
            "encoder runtime identity has an invalid closed schema",
            details={
                "source": source,
                "required": sorted(_REQUIRED_KEYS),
                "optional": sorted(_OPTIONAL_KEYS),
                "observed": sorted(observed),
            },
        )
    return EncoderRuntimeIdentity(
        schema_version=payload["schema_version"],
        model_id=payload["model_id"],
        revision=payload["revision"],
        state_contract_version=payload["state_contract_version"],
        runtime_hash=payload["runtime_hash"],
        # ``weights_hash`` is read with ``get`` and is None when absent.
        weights_hash=payload.get("weights_hash"),
    )

canonicalize_dna ¶

canonicalize_dna(sequence: str) -> str

Return uppercase DNA after validating the supported alphabet.

The cache hash invariant is based on uppercased window content, so callers can hash raw source slices and already-canonical windows interchangeably. N is accepted because reference FASTA and edited windows may contain masked bases.

Source code in geno_lewm/encoder/windowing.py

def canonicalize_dna(sequence: str) -> str:
    """Validate ``sequence`` against the DNA alphabet and return it uppercased.

    Window hashes are computed over uppercased content, which lets callers
    hash raw source slices and already-canonical windows interchangeably.
    ``N`` is part of the accepted alphabet because reference FASTA and edited
    windows may contain masked bases.

    Raises ``InputError`` for non-string input or for any character outside
    the supported alphabet.
    """
    if not isinstance(sequence, str):
        raise InputError(
            "DNA sequence must be a string",
            details={"type": type(sequence).__name__},
        )
    uppercased = sequence.upper()
    unsupported = set(uppercased).difference(_VALID_DNA_BASES)
    if not unsupported:
        return uppercased
    # Report offending characters in a stable, sorted order.
    raise InputError(
        "DNA sequence contains unsupported base(s)",
        details={"bad_chars": sorted(unsupported)},
        remediation="provide only A, C, G, T, or N bases",
    )

extract_window ¶

extract_window(source_sequence: str, *, edit_locus: int | None = None, window_bp: int = DEFAULT_WINDOW_BP, assume_canonical: bool = False) -> ExtractedWindow

Extract a supported-width DNA window from source_sequence.

`edit_locus` is a 0-based offset in `source_sequence`. When it is supplied, the window is centered on that locus unless clamped by source boundaries. When omitted, the source midpoint is used. If the source is shorter than the requested window or the selected interval extends past the right edge, trailing `A` bases are appended per Carbon's tokenizer convention.

Set assume_canonical when source_sequence is already uppercase, validated DNA (e.g. a contig from a loaded reference FASTA) to skip the O(len) re-validation. Re-validating a whole chromosome once per variant otherwise dominates VCF scoring wall-clock.

Source code in geno_lewm/encoder/windowing.py

def extract_window(
    source_sequence: str,
    *,
    edit_locus: int | None = None,
    window_bp: int = DEFAULT_WINDOW_BP,
    assume_canonical: bool = False,
) -> ExtractedWindow:
    """Cut a ``window_bp``-wide DNA window out of ``source_sequence``.

    The window is centered on ``edit_locus`` (a 0-based offset into
    ``source_sequence``) when one is given, subject to clamping at the source
    boundaries, and on the source midpoint otherwise. Whenever the selected
    interval reaches past the right edge, which includes sources shorter than
    the window, the missing bases are filled with trailing ``A`` per Carbon's
    tokenizer convention.

    Pass ``assume_canonical=True`` for input that is already uppercase,
    validated DNA (for example a contig from a loaded reference FASTA). This
    skips the O(len) validation pass, which would otherwise be repeated for a
    whole chromosome on every variant and dominate VCF scoring wall-clock.

    Raises ``InputError`` when the source is empty.
    """
    _validate_window_bp(window_bp)
    if assume_canonical:
        source = source_sequence
    else:
        source = canonicalize_dna(source_sequence)
    if not source:
        raise InputError("source_sequence must be non-empty")

    total_bp = len(source)
    anchor = _center_for(total_bp, edit_locus)
    window_start = _centered_start(total_bp, anchor, window_bp)
    window_end = window_start + window_bp

    # Take what the source actually covers, then fill the remainder on the
    # right with pad bases.
    covered = source[window_start : min(window_end, total_bp)]
    missing_bp = window_bp - len(covered)
    padded_window = covered + (_PAD_BASE * missing_bp)

    # The locus position relative to the window start, when a locus exists.
    locus_in_window = None if edit_locus is None else edit_locus - window_start

    return ExtractedWindow(
        sequence=padded_window,
        start_bp=window_start,
        end_bp=window_end,
        window_bp=window_bp,
        edit_locus=edit_locus,
        relative_edit_locus=locus_in_window,
        pad_right_bp=missing_bp,
    )

pad_for_carbon_tokenizer ¶

pad_for_carbon_tokenizer(sequence: str, *, token_bp: int = CARBON_TOKEN_BP) -> str

Right-pad canonical DNA to Carbon's token multiple.

Source code in geno_lewm/encoder/windowing.py

def pad_for_carbon_tokenizer(sequence: str, *, token_bp: int = CARBON_TOKEN_BP) -> str:
    """Extend canonical DNA on the right to a whole number of Carbon tokens.

    ``token_bp`` must be a positive ``int`` (booleans are rejected), otherwise
    ``InputError`` is raised. A sequence whose length is already a multiple of
    ``token_bp`` is returned canonicalized without any padding.
    """
    is_positive_int = (
        isinstance(token_bp, int) and not isinstance(token_bp, bool) and token_bp > 0
    )
    if not is_positive_int:
        raise InputError(
            "token_bp must be a positive integer",
            details={"token_bp": token_bp, "type": type(token_bp).__name__},
        )
    canonical = canonicalize_dna(sequence)
    # Number of bases needed to reach the next multiple; zero when aligned.
    shortfall = -len(canonical) % token_bp
    return canonical + (_PAD_BASE * shortfall)

window_sha256 ¶

window_sha256(sequence: str) -> bytes

Return SHA-256 bytes for the canonicalized DNA sequence.

Source code in geno_lewm/encoder/windowing.py

def window_sha256(sequence: str) -> bytes:
    """Hash the canonicalized form of ``sequence`` and return the raw digest."""
    digest = hashlib.sha256()
    digest.update(canonicalize_dna(sequence).encode("ascii"))
    return digest.digest()

wrap_dna_for_tokenizer ¶

wrap_dna_for_tokenizer(sequence: str) -> str

Return <dna>...</dna> input with Carbon-compatible padding.

Source code in geno_lewm/encoder/windowing.py

def wrap_dna_for_tokenizer(sequence: str) -> str:
    """Pad ``sequence`` for Carbon and enclose it in the DNA open/close tags."""
    return f"{CARBON_DNA_OPEN_TAG}{pad_for_carbon_tokenizer(sequence)}{CARBON_DNA_CLOSE_TAG}"

geno_lewm.encoder¶

encoder ¶

CacheLookupResult dataclass ¶

CacheProvenance dataclass ¶

CacheReindexReport dataclass ¶

CacheRepairReport dataclass ¶

CacheShardInspection dataclass ¶

WindowCacheKey dataclass ¶

WindowCacheRecord dataclass ¶

key property ¶

with_created_at ¶

CacheBuildReport dataclass ¶

to_dict ¶

CarbonStateEncoder ¶

encoder_hash property ¶

d_state property ¶

parameter_count property ¶

trainable_parameter_count property ¶

encode ¶

encode_batch ¶

pooling_identity ¶

PoolingResult dataclass ¶

d_state property ¶

as_cache_fields ¶

EncoderRuntimeIdentity dataclass ¶

cache_identity_hash property ¶

to_dict ¶

ExtractedWindow dataclass ¶

untargeted property ¶

sha256 property ¶

as_tokenizer_input ¶

default_cache_dir ¶

inspect_cache_shard ¶

read_cache_entries ¶

read_cache_entry ¶

read_embedding ¶

read_embeddings ¶

reindex_cache ¶

repair_cache ¶

shard_path_for ¶

write_shard ¶

build_window_cache ¶

centered_mean ¶

global_mean ¶

pool_hidden_states ¶

parse_encoder_runtime_identity_bytes ¶

canonicalize_dna ¶

extract_window ¶

pad_for_carbon_tokenizer ¶

window_sha256 ¶

wrap_dna_for_tokenizer ¶

`geno_lewm.encoder`¶

CacheLookupResult `dataclass` ¶

CacheProvenance `dataclass` ¶

CacheReindexReport `dataclass` ¶

CacheRepairReport `dataclass` ¶

CacheShardInspection `dataclass` ¶

WindowCacheKey `dataclass` ¶

WindowCacheRecord `dataclass` ¶

key `property` ¶

CacheBuildReport `dataclass` ¶

encoder_hash `property` ¶

d_state `property` ¶

parameter_count `property` ¶

trainable_parameter_count `property` ¶

PoolingResult `dataclass` ¶

d_state `property` ¶

EncoderRuntimeIdentity `dataclass` ¶

cache_identity_hash `property` ¶

ExtractedWindow `dataclass` ¶

untargeted `property` ¶

sha256 `property` ¶