Source code for boltz_data.sequence._parse

from typing import Literal

from ._constants import (
    DNA_COMP_ID_TO_ONE_LETTER,
    DNA_ONE_LETTER_TO_COMP_ID,
    PROTEIN_COMP_ID_TO_ONE_LETTER,
    PROTEIN_ONE_LETTER_TO_COMP_ID,
    RNA_COMP_ID_TO_ONE_LETTER,
    RNA_ONE_LETTER_TO_COMP_ID,
)


class NonstandardSequenceError(ValueError):
    """
    Signal that a sequence contains a residue outside the standard mapping.

    Raised when a residue name or a one-letter code has no entry in the
    lookup table selected for the polymer type. It derives from ``ValueError``,
    so handlers written for ``ValueError`` also catch it.
    """



[docs]
def sequence_from_residue_names(
    residue_names: list[str],
    /,
    *,
    polymer_type: Literal["protein", "dna", "rna"],
    nonstandard_handling: Literal["X", "error", "parentheses"],
) -> str:
    """
    Build a sequence string from residue names.

    Each residue name is translated through the lookup table selected by
    ``polymer_type``. Names missing from that table are treated as
    non-standard and rendered according to ``nonstandard_handling``.

    Args:
        residue_names: Residue names, in sequence order.
        polymer_type: Which lookup table to use: "protein", "dna", or "rna".
        nonstandard_handling: Policy for names missing from the table:
            - "X": emit the single character 'X'.
            - "error": raise ``NonstandardSequenceError``.
            - "parentheses": emit the residue name wrapped in parentheses.

    Returns:
        The concatenation of the emitted symbols.

    Raises:
        NonstandardSequenceError: If a non-standard residue is found and
            ``nonstandard_handling`` is "error".
        ValueError: If ``polymer_type`` is not recognised, or if a
            non-standard residue is found and ``nonstandard_handling`` is not
            one of the supported policies.

    """
    one_letter_codes = _get_comp_id_to_char_mapping_for_polymer_type(polymer_type)
    symbols: list[str] = []
    for comp_id in residue_names:
        try:
            symbol = one_letter_codes[comp_id]
        except KeyError:
            # The policy is only consulted once a non-standard residue shows up.
            if nonstandard_handling == "error":
                msg = f"Non-standard residue encountered: {comp_id}"
                raise NonstandardSequenceError(msg) from None
            if nonstandard_handling == "parentheses":
                symbol = f"({comp_id})"
            elif nonstandard_handling == "X":
                symbol = "X"
            else:
                msg = f"Invalid nonstandard handling: {nonstandard_handling}"
                raise ValueError(msg) from None
        symbols.append(symbol)
    return "".join(symbols)




[docs]
def residue_names_from_sequence(
    sequence: str,
    /,
    *,
    polymer_type: Literal["protein", "dna", "rna"],
) -> list[str]:
    """
    Convert a sequence string to a list of residue names.

    Characters outside parentheses are translated one at a time through the
    lookup table selected by ``polymer_type``. A parenthesised token such as
    ``(MSE)`` is taken verbatim as a residue name and is not checked against
    the table.

    Args:
        sequence: The sequence string.
        polymer_type: Type of polymer. One of "protein", "dna", or "rna".

    Returns:
        A list of residue names corresponding to the sequence.

    Raises:
        NonstandardSequenceError: If a character outside parentheses has no
            entry in the lookup table for ``polymer_type``.
        ValueError: If ``polymer_type`` is not recognised, or if an opening
            parenthesis has no matching closing parenthesis.

    """
    char_to_comp_id = _get_char_to_comp_id_mapping_for_polymer_type(polymer_type)
    residue_names: list[str] = []
    i = 0
    while i < len(sequence):
        char = sequence[i]
        if char == "(":
            # Use find() rather than index() so that a missing ")" can be
            # reported with its position instead of "substring not found".
            end_idx = sequence.find(")", i + 1)
            if end_idx == -1:
                msg = f"Unmatched '(' at position {i} in sequence"
                raise ValueError(msg)
            residue_names.append(sequence[i + 1 : end_idx])
            # Resume scanning just past the closing parenthesis.
            i = end_idx + 1
        else:
            try:
                residue_names.append(char_to_comp_id[char])
            except KeyError:
                msg = f"Non-standard character encountered in sequence: {char}"
                raise NonstandardSequenceError(msg) from None
            i += 1
    return residue_names



def _get_comp_id_to_char_mapping_for_polymer_type(polymer_type: Literal["protein", "dna", "rna"], /) -> dict[str, str]:
    if polymer_type == "protein":
        return PROTEIN_COMP_ID_TO_ONE_LETTER
    if polymer_type == "dna":
        return DNA_COMP_ID_TO_ONE_LETTER
    if polymer_type == "rna":
        return RNA_COMP_ID_TO_ONE_LETTER
    msg = f"Invalid polymer type: {polymer_type}"
    raise ValueError(msg)


def _get_char_to_comp_id_mapping_for_polymer_type(polymer_type: Literal["protein", "dna", "rna"], /) -> dict[str, str]:
    if polymer_type == "protein":
        return PROTEIN_ONE_LETTER_TO_COMP_ID
    if polymer_type == "dna":
        return DNA_ONE_LETTER_TO_COMP_ID
    if polymer_type == "rna":
        return RNA_ONE_LETTER_TO_COMP_ID
    msg = f"Invalid polymer type: {polymer_type}"
    raise ValueError(msg)