Source code for boltz_data.cif._from_structure

import gemmi
import numpy as np

from boltz_data.definition import StructureDefinition
from boltz_data.sequence._parse import residue_names_from_sequence
from boltz_data.utils import pivot

# Lookup from a Boltz polymer entity type (the ``type`` attribute of an entity
# definition) to the string written into the ``type`` column of the mmCIF
# ``_entity_poly`` category by ``mmcif_from_structure``.
BOLTZ_POLYMER_TO_MMCIF_POLYMER = {
    "protein": "polypeptide(L)",
    "rna": "polyribonucleotide",
    "dna": "polydeoxyribonucleotide",
}


def _mmcif_entity_type(entity_type: str) -> str:
    """Return the mmCIF ``_entity.type`` value for a Boltz entity type."""
    if entity_type == "ligand_ccd":
        return "non-polymer"
    if entity_type == "branched_polymer":
        # The mmCIF dictionary has a dedicated "branched" entity type; these
        # entities are the ones listed in the _pdbx_entity_branch_list and
        # _pdbx_branch_scheme categories, so they must not be tagged "polymer".
        return "branched"
    return "polymer"


def mmcif_from_structure(structure: StructureDefinition, /, *, name: str = "pred") -> gemmi.cif.Block:
    """
    Convert a StructureDefinition to an mmCIF block.

    Creates mmCIF categories for entities, polymers, non-polymers, branched polymers,
    and their corresponding chains and numbering schemes. Entity ids and sequence
    positions are written 1-based (index + 1).

    Categories written, in order: ``_entity``, ``_entity_poly``, ``_entity_poly_seq``,
    ``_pdbx_poly_seq_scheme``, ``_pdbx_entity_nonpoly``, ``_pdbx_nonpoly_scheme``,
    ``_pdbx_entity_branch_list``, ``_pdbx_branch_scheme`` and ``_struct_asym``.

    Args:
        structure: The structure definition to convert.
        name: Name for the mmCIF data block. Defaults to "pred".

    Returns:
        An mmCIF block containing the structure information.
    """
    block = gemmi.cif.Block(name)
    entities = structure.entities
    chains = structure.chains

    # One row per entity. Entity type is "non-polymer" for CCD ligands,
    # "branched" for branched polymers and "polymer" for everything else.
    block.set_mmcif_category(
        "_entity",
        pivot(
            [
                {
                    "id": str(entity_idx + 1),
                    "type": _mmcif_entity_type(definition.type),
                    "pdbx_description": definition.description,
                }
                for entity_idx, definition in enumerate(entities)
            ]
        ),
    )

    # Polymer entities are exactly those with an entry in the lookup table, so the
    # filter and the lookup below cannot disagree.
    block.set_mmcif_category(
        "_entity_poly",
        pivot(
            [
                {
                    "entity_id": str(entity_idx + 1),
                    "type": BOLTZ_POLYMER_TO_MMCIF_POLYMER[definition.type],
                }
                for entity_idx, definition in enumerate(entities)
                if definition.type in BOLTZ_POLYMER_TO_MMCIF_POLYMER
            ]
        ),
    )

    # One row per residue of every polymer entity, in sequence order.
    block.set_mmcif_category(
        "_entity_poly_seq",
        pivot(
            [
                {"entity_id": str(entity_idx + 1), "num": str(res_idx + 1), "mon_id": residue_name, "hetero": "n"}
                for entity_idx, definition in enumerate(entities)
                if definition.type in BOLTZ_POLYMER_TO_MMCIF_POLYMER
                for res_idx, residue_name in enumerate(
                    residue_names_from_sequence(definition.sequence, polymer_type=definition.type)
                )
            ]
        ),
    )

    # Per-chain residue numbering for polymer chains. When a chain carries no
    # explicit residue numbers, fall back to 1..len(sequence).
    block.set_mmcif_category(
        "_pdbx_poly_seq_scheme",
        pivot(
            [
                {
                    "asym_id": chain_id,
                    "entity_id": chain.entity_idx + 1,
                    "seq_id": residue_idx + 1,
                    "pdb_seq_num": residue_number,
                }
                for chain_id, chain in chains.items()
                if (entity := entities[chain.entity_idx]).type in BOLTZ_POLYMER_TO_MMCIF_POLYMER
                for residue_idx, residue_number in enumerate(
                    chain.residue_numbers
                    if chain.residue_numbers is not None
                    else np.arange(1, len(entity.sequence) + 1)
                )
            ]
        ),
    )

    # One row per CCD ligand entity.
    block.set_mmcif_category(
        "_pdbx_entity_nonpoly",
        pivot(
            [
                {"entity_id": str(entity_idx + 1), "comp_id": definition.comp_id}
                for entity_idx, definition in enumerate(entities)
                if definition.type == "ligand_ccd"
            ]
        ),
    )

    # One row per ligand chain; the residue number is the chain's first explicit
    # residue number, or 1 when none are given.
    block.set_mmcif_category(
        "_pdbx_nonpoly_scheme",
        pivot(
            [
                {
                    "asym_id": chain_id,
                    "entity_id": chain.entity_idx + 1,
                    "mon_id": entity.comp_id,
                    "pdb_seq_num": chain.residue_numbers[0] if chain.residue_numbers is not None else 1,
                }
                for chain_id, chain in chains.items()
                if (entity := entities[chain.entity_idx]).type == "ligand_ccd"
            ]
        ),
    )

    # One row per component of every branched-polymer entity.
    block.set_mmcif_category(
        "_pdbx_entity_branch_list",
        pivot(
            [
                {"entity_id": str(entity_idx + 1), "num": res_idx + 1, "comp_id": residue_name}
                for entity_idx, definition in enumerate(entities)
                if definition.type == "branched_polymer"
                for res_idx, residue_name in enumerate(definition.comp_ids)
            ]
        ),
    )

    # Per-chain numbering for branched-polymer chains, defaulting to
    # 1..len(comp_ids) when the chain has no explicit residue numbers.
    block.set_mmcif_category(
        "_pdbx_branch_scheme",
        pivot(
            [
                {"asym_id": chain_id, "entity_id": chain.entity_idx + 1, "num": res_idx + 1, "pdb_seq_num": resnum}
                for chain_id, chain in chains.items()
                if (entity := entities[chain.entity_idx]).type == "branched_polymer"
                for res_idx, resnum in enumerate(
                    chain.residue_numbers
                    if chain.residue_numbers is not None
                    else np.arange(1, len(entity.comp_ids) + 1)
                )
            ]
        ),
    )

    # Every chain, regardless of entity type, mapped to its 1-based entity id.
    block.set_mmcif_category(
        "_struct_asym",
        pivot([{"id": chain_id, "entity_id": chain.entity_idx + 1} for chain_id, chain in chains.items()]),
    )

    return block