Skip to content

geno_lewm.surprise.context

context

Context stratification labels for RFC-0009 calibration buckets.

REGION_CLASSES module-attribute

REGION_CLASSES: tuple[str, ...] = ('coding_synonymous', 'coding_missense', 'coding_nonsense', 'splice', 'utr5', 'utr3', 'intron', 'promoter', 'enhancer', 'intergenic', 'other')

Canonical RFC-0009 region_class values.

GC_BINS module-attribute

GC_BINS: tuple[str, ...] = ('low', 'mid', 'high')

Canonical RFC-0009 gc_bin values.

REPEAT_CLASSES module-attribute

REPEAT_CLASSES: tuple[str, ...] = ('none', 'simple', 'low_complexity', 'transposon', 'segmental_dup')

Canonical RFC-0009 repeat_class values.

UNKNOWN_BUCKET_ID module-attribute

UNKNOWN_BUCKET_ID = '*'

Catch-all calibration bucket reached after every parent bucket is sparse.

DEFAULT_GC_LOW_CUTOFF module-attribute

DEFAULT_GC_LOW_CUTOFF: float = 1.0 / 3.0

Inclusive lower-tercile GC cutoff used when no fitted cutpoints are supplied; a GC fraction less than or equal to this value is binned as low.

DEFAULT_GC_HIGH_CUTOFF module-attribute

DEFAULT_GC_HIGH_CUTOFF: float = 2.0 / 3.0

Inclusive upper-tercile GC cutoff used when no fitted cutpoints are supplied; a GC fraction greater than or equal to this value is binned as high.

DEFAULT_MIN_BUCKET_SIZE module-attribute

DEFAULT_MIN_BUCKET_SIZE = 1000

RFC-0009 default threshold for a well-populated calibration bucket.

ContextLabel dataclass

ContextLabel(region_class: str, gc_bin: str, repeat_class: str)

Canonical RFC-0009 context tuple for a single variant locus.

bucket_id property

bucket_id: str

Return the bucket ID string in the form `{region_class}|{gc_bin}|{repeat_class}`.

as_tuple

as_tuple() -> tuple[str, str, str]

Return the canonical (region_class, gc_bin, repeat_class) tuple.

Source code in geno_lewm/surprise/context.py
def as_tuple(self) -> tuple[str, str, str]:
    """Return this label's fields as a plain 3-tuple.

    The element order is ``(region_class, gc_bin, repeat_class)``.
    """
    region_class = self.region_class
    gc_bin = self.gc_bin
    repeat_class = self.repeat_class
    return region_class, gc_bin, repeat_class

backoff_chain

backoff_chain() -> tuple[str, ...]

Return bucket IDs from most specific to catch-all.

Source code in geno_lewm/surprise/context.py
def backoff_chain(self) -> tuple[str, ...]:
    """Return this label's bucket IDs, most specific first, catch-all last.

    Delegates to the module-level ``backoff_chain`` function, passing this
    label as the argument.
    """
    # Inside the method body the bare name resolves to the module-level
    # function, not to this method, so this is delegation, not recursion.
    chain = backoff_chain(self)
    return chain

classify_context

classify_context(*, region: str | Sequence[str] | None, gc_window: str, repeat: str | Sequence[str] | None = None, low_gc_cutoff: float = DEFAULT_GC_LOW_CUTOFF, high_gc_cutoff: float = DEFAULT_GC_HIGH_CUTOFF) -> ContextLabel

Build a canonical context label from annotation terms and a DNA window.

region and repeat accept upstream annotation labels such as VEP/SnpEff consequences or repeat-masker class strings. gc_window is the sequence window around the variant locus.

Source code in geno_lewm/surprise/context.py
def classify_context(
    *,
    region: str | Sequence[str] | None,
    gc_window: str,
    repeat: str | Sequence[str] | None = None,
    low_gc_cutoff: float = DEFAULT_GC_LOW_CUTOFF,
    high_gc_cutoff: float = DEFAULT_GC_HIGH_CUTOFF,
) -> ContextLabel:
    """Assemble a ``ContextLabel`` from raw annotations and a DNA window.

    Each of the three label fields is produced by its own classifier:

    * ``region`` is passed to ``classify_region``.
    * ``gc_window`` is passed to ``classify_gc_bin`` together with
      ``low_gc_cutoff`` and ``high_gc_cutoff``.
    * ``repeat`` is passed to ``classify_repeat``.

    All arguments are keyword-only. Any error raised by one of the
    classifiers propagates to the caller unchanged.
    """
    # Classify in region -> GC -> repeat order so the first failing input
    # is the one reported.
    region_class = classify_region(region)
    gc_bin = classify_gc_bin(
        gc_window,
        low_cutoff=low_gc_cutoff,
        high_cutoff=high_gc_cutoff,
    )
    repeat_class = classify_repeat(repeat)
    return ContextLabel(
        region_class=region_class,
        gc_bin=gc_bin,
        repeat_class=repeat_class,
    )

classify_region

classify_region(annotation: str | Sequence[str] | None) -> str

Return the canonical region_class for annotation term(s).

Source code in geno_lewm/surprise/context.py
def classify_region(annotation: str | Sequence[str] | None) -> str:
    """Map region annotation term(s) onto a canonical ``region_class``.

    The annotation is first normalized by ``_annotation_terms``. The alias
    groups in ``_REGION_ALIAS_GROUPS`` are then scanned in order, and the
    first group whose canonical name or aliases contain any of the terms
    wins. ``"other"`` is returned when there are no terms or nothing
    matches.
    """
    terms = _annotation_terms(annotation, field="region")
    if not terms:
        return "other"

    # Lazily yield every matching class in group order; only the first
    # one is consumed.
    matching_classes = (
        canonical
        for canonical, aliases in _REGION_ALIAS_GROUPS
        if any(term == canonical or term in aliases for term in terms)
    )
    return next(matching_classes, "other")

classify_repeat

classify_repeat(annotation: str | Sequence[str] | None) -> str

Return the canonical repeat_class for repeat annotation term(s).

Source code in geno_lewm/surprise/context.py
def classify_repeat(annotation: str | Sequence[str] | None) -> str:
    """Map repeat annotation term(s) onto a canonical ``repeat_class``.

    The annotation is first normalized by ``_annotation_terms``. When that
    yields no terms the result is ``"none"``. Otherwise the alias groups in
    ``_REPEAT_ALIAS_GROUPS`` are scanned in order and the first group whose
    canonical name or aliases contain any of the terms wins.

    Raises:
        InputError: if terms are present but none maps to a known class.
    """
    terms = _annotation_terms(annotation, field="repeat")
    if not terms:
        return "none"

    for canonical, aliases in _REPEAT_ALIAS_GROUPS:
        for term in terms:
            if term == canonical or term in aliases:
                return canonical

    # Unlike an absent annotation, an unmapped one is reported as an error
    # rather than being assigned a class.
    error_details = {"annotation": list(terms), "allowed": list(REPEAT_CLASSES)}
    raise InputError(
        "repeat annotation does not map to a known repeat_class",
        details=error_details,
        remediation="normalize the repeat track to none/simple/low_complexity/transposon/segmental_dup",
    )

gc_fraction

gc_fraction(sequence: str) -> float

Return GC fraction over called A/C/G/T bases in sequence.

N bases are valid in reference windows but are excluded from the denominator because their GC status is unknown. A window containing no called bases is rejected.

Source code in geno_lewm/surprise/context.py
def gc_fraction(sequence: str) -> float:
    """Compute the GC fraction of ``sequence`` over its called bases.

    The sequence is normalized with ``canonicalize_dna``. The denominator
    counts only bases found in ``_CALLED_BASES``, so ``N`` positions do not
    dilute the fraction. The numerator counts bases found in ``_GC_BASES``.

    Raises:
        InputError: if the window has no called A/C/G/T bases at all.
    """
    bases = canonicalize_dna(sequence)

    n_called = sum(1 for base in bases if base in _CALLED_BASES)
    if n_called == 0:
        # Without this guard the division below would be by zero.
        raise InputError(
            "GC window contains no called A/C/G/T bases",
            details={"length": len(bases)},
        )

    n_gc = sum(1 for base in bases if base in _GC_BASES)
    return n_gc / n_called

classify_gc_bin

classify_gc_bin(sequence: str, *, low_cutoff: float = DEFAULT_GC_LOW_CUTOFF, high_cutoff: float = DEFAULT_GC_HIGH_CUTOFF) -> str

Return low, mid, or high for a DNA window's GC fraction.

Source code in geno_lewm/surprise/context.py
def classify_gc_bin(
    sequence: str,
    *,
    low_cutoff: float = DEFAULT_GC_LOW_CUTOFF,
    high_cutoff: float = DEFAULT_GC_HIGH_CUTOFF,
) -> str:
    """Bin a DNA window's GC fraction as ``low``, ``mid``, or ``high``.

    Both cutoffs are inclusive: a fraction equal to ``low_cutoff`` is
    ``low`` and a fraction equal to ``high_cutoff`` is ``high``. Anything
    strictly between the two is ``mid``.

    Both cutoffs are validated before the sequence is inspected.

    Raises:
        InputError: if ``low_cutoff`` is not strictly below ``high_cutoff``.
            Errors from ``_validate_cutoff`` and ``gc_fraction`` propagate
            unchanged.
    """
    low = _validate_cutoff("low_cutoff", low_cutoff)
    high = _validate_cutoff("high_cutoff", high_cutoff)
    if high <= low:
        raise InputError(
            "low_cutoff must be less than high_cutoff",
            details={"low_cutoff": low, "high_cutoff": high},
        )

    fraction = gc_fraction(sequence)
    # The low test runs first; with low < high at most one boundary can
    # match.
    if fraction <= low:
        return "low"
    return "high" if fraction >= high else "mid"

make_bucket_id

make_bucket_id(region_class: str, gc_bin: str, repeat_class: str) -> str

Return the stable full calibration bucket ID for a context tuple.

Source code in geno_lewm/surprise/context.py
def make_bucket_id(region_class: str, gc_bin: str, repeat_class: str) -> str:
    """Build the full calibration bucket ID for one context tuple.

    Each component is checked with ``_require_member`` against its
    canonical vocabulary (``REGION_CLASSES``, ``GC_BINS``,
    ``REPEAT_CLASSES``). The checked values are then combined by
    ``_join_bucket_parts`` in region, GC, repeat order.
    """
    # Validate in field order so the first invalid component is the one
    # reported.
    region_part = _require_member("region_class", region_class, REGION_CLASSES)
    gc_part = _require_member("gc_bin", gc_bin, GC_BINS)
    repeat_part = _require_member("repeat_class", repeat_class, REPEAT_CLASSES)
    return _join_bucket_parts((region_part, gc_part, repeat_part))

backoff_chain

backoff_chain(label_or_bucket: ContextLabel | str) -> tuple[str, ...]

Return fixed parent-bucket IDs ending in *.

Full buckets back off as region|gc|repeat -> region|gc -> region -> *. Parent buckets can also be passed directly.

Source code in geno_lewm/surprise/context.py
def backoff_chain(label_or_bucket: ContextLabel | str) -> tuple[str, ...]:
    """List the bucket IDs to try, most specific first, ending in ``*``.

    The input is split into parts by ``_bucket_parts``. Each step of the
    chain drops the trailing part, so a full bucket backs off as
    ``region|gc|repeat`` -> ``region|gc`` -> ``region`` -> ``*``. A
    parent bucket may be passed directly and yields a shorter chain. When
    there are no parts, only the catch-all is returned.
    """
    parts = _bucket_parts(label_or_bucket)
    if not parts:
        return (UNKNOWN_BUCKET_ID,)

    chain = []
    # Walk prefix lengths from the full length down to 1.
    for prefix_len in range(len(parts), 0, -1):
        chain.append(_join_bucket_parts(parts[:prefix_len]))
    chain.append(UNKNOWN_BUCKET_ID)
    return tuple(chain)

select_backoff_bucket

select_backoff_bucket(label_or_bucket: ContextLabel | str, bucket_sizes: Mapping[str, int], *, min_count: int = DEFAULT_MIN_BUCKET_SIZE) -> str

Return the first bucket in the backoff chain with enough calibration rows.

If every specific parent is sparse, the catch-all * bucket is returned. Downstream calibration code can still report low confidence based on that bucket's own count.

Source code in geno_lewm/surprise/context.py
def select_backoff_bucket(
    label_or_bucket: ContextLabel | str,
    bucket_sizes: Mapping[str, int],
    *,
    min_count: int = DEFAULT_MIN_BUCKET_SIZE,
) -> str:
    """Pick the most specific bucket holding at least ``min_count`` rows.

    The chain from ``backoff_chain`` is scanned from most specific to
    least. A bucket missing from ``bucket_sizes`` counts as having zero
    rows. The final chain entry (the catch-all) is never size-checked: it
    is returned as the fallback when every earlier bucket is too sparse,
    so callers that need a confidence signal must look at its count
    themselves.

    ``min_count`` is validated by ``_validate_positive_int`` and
    ``bucket_sizes`` by ``_validate_bucket_sizes`` before the chain is
    built.
    """
    threshold = _validate_positive_int("min_count", min_count)
    counts = _validate_bucket_sizes(bucket_sizes)
    chain = backoff_chain(label_or_bucket)

    # Lazily filter every entry except the last down to those that meet
    # the threshold; only the first hit is consumed.
    well_populated = (
        candidate
        for candidate in chain[:-1]
        if counts.get(candidate, 0) >= threshold
    )
    return next(well_populated, chain[-1])