# Source code for boltz_data.sequence._parse

from typing import Literal

from ._constants import (
    DNA_COMP_ID_TO_ONE_LETTER,
    DNA_ONE_LETTER_TO_COMP_ID,
    PROTEIN_COMP_ID_TO_ONE_LETTER,
    PROTEIN_ONE_LETTER_TO_COMP_ID,
    RNA_COMP_ID_TO_ONE_LETTER,
    RNA_ONE_LETTER_TO_COMP_ID,
)


class NonstandardSequenceError(ValueError):
    """
    Exception raised for non-standard residues in a sequence.

    Raised by ``sequence_from_residue_names`` when a residue name has no one-letter code and
    ``nonstandard_handling`` is ``"error"``, and by ``residue_names_from_sequence`` when a
    character outside parentheses has no residue name for the selected polymer type.

    Subclasses ``ValueError``, so ``except ValueError`` also catches it.
    """


def sequence_from_residue_names(
    residue_names: list[str],
    /,
    *,
    polymer_type: Literal["protein", "dna", "rna"],
    nonstandard_handling: Literal["X", "error", "parentheses"],
) -> str:
    """
    Build a sequence string from a list of residue names.

    Each residue name is looked up in the residue-name-to-one-letter table selected by
    ``polymer_type``. Names that are missing from the table are treated as non-standard and
    rendered according to ``nonstandard_handling``.

    Args:
        residue_names: List of residue names.
        polymer_type: Type of polymer. One of "protein", "dna", or "rna".
        nonstandard_handling: How to handle non-standard residues. One of:
            - "X": Replace non-standard residues with 'X'.
            - "error": Raise an error if a non-standard residue is encountered.
            - "parentheses": Wrap non-standard residues in parentheses.

    Returns:
        A string representing the sequence.

    Raises:
        NonstandardSequenceError: If a non-standard residue is encountered and
            ``nonstandard_handling`` is "error".
        ValueError: If ``polymer_type`` is invalid, or if a non-standard residue is
            encountered and ``nonstandard_handling`` is not one of the supported values.
    """
    one_letter_by_comp_id = _get_comp_id_to_char_mapping_for_polymer_type(polymer_type)

    tokens: list[str] = []
    for residue_name in residue_names:
        try:
            token = one_letter_by_comp_id[residue_name]
        except KeyError:
            # The handling mode is only inspected once a non-standard residue actually shows up.
            if nonstandard_handling == "error":
                msg = f"Non-standard residue encountered: {residue_name}"
                raise NonstandardSequenceError(msg) from None
            if nonstandard_handling == "parentheses":
                token = f"({residue_name})"
            elif nonstandard_handling == "X":
                token = "X"
            else:
                msg = f"Invalid nonstandard handling: {nonstandard_handling}"
                raise ValueError(msg) from None
        tokens.append(token)

    return "".join(tokens)


def residue_names_from_sequence(
    sequence: str,
    /,
    *,
    polymer_type: Literal["protein", "dna", "rna"],
) -> list[str]:
    """
    Convert a sequence string to a list of residue names.

    Characters outside parentheses are looked up one at a time in the one-letter-to-residue-name
    table selected by ``polymer_type``. Text enclosed in parentheses (for example ``(ABC)``) is
    taken verbatim as a single residue name, without any table lookup.

    Args:
        sequence: The sequence string.
        polymer_type: Type of polymer. One of "protein", "dna", or "rna".

    Returns:
        A list of residue names corresponding to the sequence.

    Raises:
        NonstandardSequenceError: If a character outside parentheses has no entry in the table.
        ValueError: If ``polymer_type`` is invalid, or if an opening parenthesis has no
            matching closing parenthesis.
    """
    char_to_comp_id = _get_char_to_comp_id_mapping_for_polymer_type(polymer_type)

    residue_names: list[str] = []
    i = 0
    while i < len(sequence):
        char = sequence[i]
        if char == "(":
            # str.find returns -1 rather than raising, so an unterminated "(" can be reported
            # with a descriptive message instead of the opaque "substring not found".
            end_idx = sequence.find(")", i)
            if end_idx == -1:
                msg = f"Unclosed parenthesis at position {i} in sequence: {sequence}"
                raise ValueError(msg)
            residue_names.append(sequence[i + 1 : end_idx])
            # Resume scanning right after the closing parenthesis.
            i = end_idx + 1
        else:
            try:
                residue_names.append(char_to_comp_id[char])
            except KeyError:
                msg = f"Non-standard character encountered in sequence: {char}"
                raise NonstandardSequenceError(msg) from None
            i += 1

    return residue_names


def _get_comp_id_to_char_mapping_for_polymer_type(polymer_type: Literal["protein", "dna", "rna"], /) -> dict[str, str]: if polymer_type == "protein": return PROTEIN_COMP_ID_TO_ONE_LETTER if polymer_type == "dna": return DNA_COMP_ID_TO_ONE_LETTER if polymer_type == "rna": return RNA_COMP_ID_TO_ONE_LETTER msg = f"Invalid polymer type: {polymer_type}" raise ValueError(msg) def _get_char_to_comp_id_mapping_for_polymer_type(polymer_type: Literal["protein", "dna", "rna"], /) -> dict[str, str]: if polymer_type == "protein": return PROTEIN_ONE_LETTER_TO_COMP_ID if polymer_type == "dna": return DNA_ONE_LETTER_TO_COMP_ID if polymer_type == "rna": return RNA_ONE_LETTER_TO_COMP_ID msg = f"Invalid polymer type: {polymer_type}" raise ValueError(msg)