`geno_lewm.encoder.pooling`¶

pooling ¶

Pooling strategies for Carbon hidden states.

Defined by the encoder contract. This module is intentionally independent of torch so that the pooling contract, cache metadata, and downstream schema behavior can be validated before the Carbon runtime wrapper lands.

PoolingResult `dataclass` ¶

PoolingResult(vector: tuple[float, ...], pool_type: Literal['centered_mean', 'global_mean'], pool_radius: int, untargeted: bool, center_token: int | None, token_count: int)

Pooled state vector plus cache-key metadata.

d_state `property` ¶

d_state: int

Return the pooled vector width.

as_cache_fields ¶

as_cache_fields() -> Mapping[str, object]

Return fields shared with the window-cache schema.

Source code in geno_lewm/encoder/pooling.py

def as_cache_fields(self) -> Mapping[str, object]:
    """Expose the pooling metadata shared with the window-cache schema.

    Returns:
        A freshly built dict holding this result's ``pool_type``,
        ``pool_radius`` and ``untargeted`` attributes, in that key order.
    """
    shared_fields = ("pool_type", "pool_radius", "untargeted")
    return {name: getattr(self, name) for name in shared_fields}

global_mean ¶

global_mean(hidden_states: Sequence[Sequence[float]]) -> tuple[float, ...]

Mean-pool every token vector in hidden_states.

Source code in geno_lewm/encoder/pooling.py

def global_mean(hidden_states: Sequence[Sequence[float]]) -> tuple[float, ...]:
    """Average all token vectors of ``hidden_states`` into one pooled vector.

    The input is passed through ``_coerce_hidden_states`` and the resulting
    rows are handed to ``_mean_rows``, whose output is returned unchanged.
    """
    return _mean_rows(_coerce_hidden_states(hidden_states))

centered_mean ¶

centered_mean(hidden_states: Sequence[Sequence[float]], *, center_token: int, pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS) -> tuple[float, ...]

Mean-pool the inclusive token span center_token ± pool_radius.

Source code in geno_lewm/encoder/pooling.py

def centered_mean(
    hidden_states: Sequence[Sequence[float]],
    *,
    center_token: int,
    pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS,
) -> tuple[float, ...]:
    """Average the token vectors within ``pool_radius`` of ``center_token``.

    The window covers ``center_token - pool_radius`` through
    ``center_token + pool_radius`` inclusive and is clipped to the rows that
    actually exist, so a center near either edge pools fewer tokens.

    Args:
        hidden_states: Per-token vectors; passed through
            ``_coerce_hidden_states`` before use.
        center_token: Index of the window center; checked by
            ``_validate_center_token`` against the number of rows.
        pool_radius: Number of tokens taken on each side of the center;
            checked by ``_validate_pool_radius``.

    Returns:
        The result of ``_mean_rows`` over the clipped window.
    """
    token_rows = _coerce_hidden_states(hidden_states)
    n_tokens = len(token_rows)
    # Validation order (center first, then radius) is kept as-is.
    pivot = _validate_center_token(center_token, n_tokens)
    half_width = _validate_pool_radius(pool_radius)

    # Inclusive span [pivot - half_width, pivot + half_width], clipped to
    # the valid index range; the slice stop is exclusive, hence the "+ 1".
    window = slice(
        max(0, pivot - half_width),
        min(n_tokens, pivot + half_width + 1),
    )
    return _mean_rows(token_rows[window])

pool_hidden_states ¶

pool_hidden_states(hidden_states: Sequence[Sequence[float]], *, edit_locus: int | None = None, center_token: int | None = None, content_token_bounds: tuple[int, int] | None = None, pool_type: Literal['centered_mean', 'global_mean'] = POOL_CENTERED_MEAN, pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS) -> PoolingResult

Pool token-level hidden states into a state vector.

center_token is the actual hidden-state index resolved from the tokenizer's DNA/control-token layout. edit_locus only records whether the state is targeted; this function deliberately does not approximate a token index from base-pair arithmetic. When the locus is absent, the encoder contract requires a global-mean fallback tagged as untargeted.

Source code in geno_lewm/encoder/pooling.py

def pool_hidden_states(
    hidden_states: Sequence[Sequence[float]],
    *,
    edit_locus: int | None = None,
    center_token: int | None = None,
    content_token_bounds: tuple[int, int] | None = None,
    pool_type: Literal["centered_mean", "global_mean"] = POOL_CENTERED_MEAN,
    pool_radius: int = DEFAULT_POOL_RADIUS_TOKENS,
) -> PoolingResult:
    """Pool token-level hidden states into a state vector.

    ``center_token`` is the actual hidden-state index resolved from the
    tokenizer's DNA/control-token layout. ``edit_locus`` only records whether
    the state is targeted; this function deliberately does not approximate a
    token index from base-pair arithmetic. When the locus is absent, the
    encoder contract requires a global-mean fallback tagged as untargeted.

    Args:
        hidden_states: Per-token vectors; passed through
            ``_coerce_hidden_states`` before use.
        edit_locus: ``None`` selects the untargeted global-mean fallback.
            Any other value is checked by ``_validate_edit_locus`` and marks
            the result as targeted; it is not used to derive a token index.
        center_token: Index of the pooling center. Required for targeted
            ``centered_mean`` pooling and must be ``None`` in every other
            case.
        content_token_bounds: Optional ``(start, end)`` pair restricting the
            centered window, with ``end`` exclusive. ``None`` means the whole
            sequence. In the untargeted branch a supplied value is validated
            but does not change the pooled rows.
        pool_type: Requested strategy, checked by ``_validate_pool_type``.
        pool_radius: Tokens pooled on each side of ``center_token``; must be
            ``0`` when ``pool_type`` is ``global_mean``.

    Returns:
        A ``PoolingResult`` holding the pooled vector and the metadata of the
        strategy that was actually applied. ``token_count`` is always the
        total number of rows, not the size of the pooled window.

    Raises:
        InputError: If ``global_mean`` is requested with a non-zero radius,
            if ``center_token`` is supplied where it must be absent, if
            targeted ``centered_mean`` pooling lacks a ``center_token``, or
            if the center lies outside the content-token bounds. The
            ``_validate_*`` helpers may raise as well.
    """
    rows = _coerce_hidden_states(hidden_states)
    requested_type = _validate_pool_type(pool_type)
    radius = _validate_pool_radius(pool_radius)
    if requested_type == POOL_GLOBAL_MEAN and radius != 0:
        raise InputError(
            "global_mean pooling requires pool_radius=0",
            details={"pool_radius": radius},
        )

    token_count = len(rows)

    # Untargeted: no locus means no meaningful center, so fall back to the
    # global mean regardless of the requested pool type.
    if edit_locus is None:
        if center_token is not None:
            raise InputError(
                "center_token must be absent when edit_locus is absent",
                details={"center_token": center_token},
            )
        if content_token_bounds is not None:
            # Validated for caller feedback only; the fallback pools all rows.
            _validate_content_token_bounds(content_token_bounds, token_count=token_count)
        return PoolingResult(
            vector=_mean_rows(rows),
            pool_type=POOL_GLOBAL_MEAN,
            pool_radius=0,
            untargeted=True,
            center_token=None,
            token_count=token_count,
        )

    _validate_edit_locus(edit_locus)

    # Targeted global mean: pools every row and carries no center.
    if requested_type == POOL_GLOBAL_MEAN:
        if center_token is not None:
            raise InputError(
                "center_token must be absent for global_mean pooling",
                details={"center_token": center_token},
            )
        return PoolingResult(
            vector=_mean_rows(rows),
            pool_type=POOL_GLOBAL_MEAN,
            pool_radius=0,
            untargeted=False,
            center_token=None,
            token_count=token_count,
        )

    # Targeted centered mean: needs an explicit, tokenizer-resolved center.
    if center_token is None:
        raise InputError(
            "centered_mean pooling requires a tokenizer-resolved center_token",
            remediation="derive the center from the tokenized <dna>...</dna> layout",
        )
    center = _validate_center_token(center_token, token_count)

    # Default to the full sequence only when the bounds are truly absent.
    # An explicit ``is None`` test (rather than truthiness) ensures that a
    # falsy but non-None argument such as ``()`` reaches the validator,
    # exactly as it does in the untargeted branch above.
    bounds = (0, token_count) if content_token_bounds is None else content_token_bounds
    content_start, content_end = _validate_content_token_bounds(
        bounds,
        token_count=token_count,
    )
    if center < content_start or center >= content_end:
        raise InputError(
            "center_token falls outside the DNA content-token bounds",
            details={
                "center_token": center,
                "content_start": content_start,
                "content_end": content_end,
            },
        )

    # Inclusive window center ± radius, clipped to the content span so that
    # tokens outside the bounds never contribute to the mean.
    start = max(content_start, center - radius)
    end = min(content_end, center + radius + 1)
    return PoolingResult(
        vector=_mean_rows(rows[start:end]),
        pool_type=POOL_CENTERED_MEAN,
        pool_radius=radius,
        untargeted=False,
        center_token=center,
        token_count=token_count,
    )

geno_lewm.encoder.pooling¶

pooling ¶

PoolingResult dataclass ¶

d_state property ¶

as_cache_fields ¶

global_mean ¶

centered_mean ¶

pool_hidden_states ¶

`geno_lewm.encoder.pooling`¶

PoolingResult `dataclass` ¶

d_state `property` ¶