Source code for boltz_data.sequence._parse
from typing import Literal
from ._constants import (
DNA_COMP_ID_TO_ONE_LETTER,
DNA_ONE_LETTER_TO_COMP_ID,
PROTEIN_COMP_ID_TO_ONE_LETTER,
PROTEIN_ONE_LETTER_TO_COMP_ID,
RNA_COMP_ID_TO_ONE_LETTER,
RNA_ONE_LETTER_TO_COMP_ID,
)
class NonstandardSequenceError(ValueError):
    """Raised when a sequence contains a residue outside the standard set.

    Subclasses ``ValueError``, so callers that already catch ``ValueError``
    also catch this exception.
    """
[docs]
def sequence_from_residue_names(
    residue_names: list[str],
    /,
    *,
    polymer_type: Literal["protein", "dna", "rna"],
    nonstandard_handling: Literal["X", "error", "parentheses"],
) -> str:
    """
    Build a sequence string from a list of residue names.

    Every residue name found in the one-letter table for ``polymer_type`` is
    emitted as its one-letter code. Names missing from that table are treated
    as non-standard and handled according to ``nonstandard_handling``.

    Args:
        residue_names: List of residue names.
        polymer_type: Type of polymer. One of "protein", "dna", or "rna".
        nonstandard_handling: How to handle non-standard residues. One of:
            - "X": Replace non-standard residues with 'X'.
            - "error": Raise an error if a non-standard residue is encountered.
            - "parentheses": Wrap non-standard residues in parentheses.

    Returns:
        A string representing the sequence.

    Raises:
        NonstandardSequenceError: If a non-standard residue is encountered and
            ``nonstandard_handling`` is "error".
        ValueError: If ``polymer_type`` is invalid, or if a non-standard residue
            is encountered and ``nonstandard_handling`` is not a recognised option.
    """
    one_letter_codes = _get_comp_id_to_char_mapping_for_polymer_type(polymer_type)
    pieces: list[str] = []
    for residue_name in residue_names:
        try:
            piece = one_letter_codes[residue_name]
        except KeyError:
            # The handling mode is only inspected once a non-standard residue
            # actually shows up.
            if nonstandard_handling == "error":
                msg = f"Non-standard residue encountered: {residue_name}"
                raise NonstandardSequenceError(msg) from None
            if nonstandard_handling == "parentheses":
                piece = f"({residue_name})"
            elif nonstandard_handling == "X":
                piece = "X"
            else:
                msg = f"Invalid nonstandard handling: {nonstandard_handling}"
                raise ValueError(msg) from None
        pieces.append(piece)
    return "".join(pieces)
[docs]
def residue_names_from_sequence(
    sequence: str,
    /,
    *,
    polymer_type: Literal["protein", "dna", "rna"],
) -> list[str]:
    """
    Convert a sequence string to a list of residue names.

    Each character outside parentheses is looked up in the one-letter table for
    ``polymer_type``. The text between a "(" and the next ")" is taken verbatim
    as a single residue name, without any table lookup.

    Args:
        sequence: The sequence string.
        polymer_type: Type of polymer. One of "protein", "dna", or "rna".

    Returns:
        A list of residue names corresponding to the sequence.

    Raises:
        NonstandardSequenceError: If a character outside parentheses has no
            entry in the one-letter table for ``polymer_type``.
        ValueError: If ``polymer_type`` is invalid, or if a "(" has no
            matching ")" later in the sequence.
    """
    char_to_comp_id = _get_char_to_comp_id_mapping_for_polymer_type(polymer_type)
    residue_names: list[str] = []
    i = 0
    while i < len(sequence):
        char = sequence[i]
        if char == "(":
            # str.find returns -1 instead of raising, so the malformed input
            # can be reported with a message that says what went wrong.
            end_idx = sequence.find(")", i)
            if end_idx == -1:
                msg = f"Unclosed parenthesis at position {i} in sequence"
                raise ValueError(msg)
            residue_names.append(sequence[i + 1 : end_idx])
            # Resume scanning just past the closing parenthesis.
            i = end_idx + 1
        else:
            try:
                residue_names.append(char_to_comp_id[char])
            except KeyError:
                msg = f"Non-standard character encountered in sequence: {char}"
                raise NonstandardSequenceError(msg) from None
            i += 1
    return residue_names
def _get_comp_id_to_char_mapping_for_polymer_type(polymer_type: Literal["protein", "dna", "rna"], /) -> dict[str, str]:
if polymer_type == "protein":
return PROTEIN_COMP_ID_TO_ONE_LETTER
if polymer_type == "dna":
return DNA_COMP_ID_TO_ONE_LETTER
if polymer_type == "rna":
return RNA_COMP_ID_TO_ONE_LETTER
msg = f"Invalid polymer type: {polymer_type}"
raise ValueError(msg)
def _get_char_to_comp_id_mapping_for_polymer_type(polymer_type: Literal["protein", "dna", "rna"], /) -> dict[str, str]:
if polymer_type == "protein":
return PROTEIN_ONE_LETTER_TO_COMP_ID
if polymer_type == "dna":
return DNA_ONE_LETTER_TO_COMP_ID
if polymer_type == "rna":
return RNA_ONE_LETTER_TO_COMP_ID
msg = f"Invalid polymer type: {polymer_type}"
raise ValueError(msg)