# Source code for boltz_data.cif._from_structure

import gemmi
import numpy as np

from boltz_data.definition import StructureDefinition
from boltz_data.sequence._parse import residue_names_from_sequence
from boltz_data.utils import pivot

# Maps each linear polymer entity type used by the structure definitions to the
# value written to `_entity_poly.type`. The keys also serve as the set of entity
# types that are treated as sequence-based polymers.
BOLTZ_POLYMER_TO_MMCIF_POLYMER = dict(
    protein="polypeptide(L)",
    rna="polyribonucleotide",
    dna="polydeoxyribonucleotide",
)



# [docs]
def mmcif_from_structure(structure: StructureDefinition, /, *, name: str = "pred") -> gemmi.cif.Block:
    """
    Convert a StructureDefinition to an mmCIF block.

    The following categories are written, in this order: ``_entity``,
    ``_entity_poly``, ``_entity_poly_seq``, ``_pdbx_poly_seq_scheme``,
    ``_pdbx_entity_nonpoly``, ``_pdbx_nonpoly_scheme``,
    ``_pdbx_entity_branch_list``, ``_pdbx_branch_scheme`` and ``_struct_asym``.

    Entity ids are the 1-based positions of the entities in
    ``structure.entities``; asym ids are the keys of ``structure.chains``.
    When a chain carries no explicit ``residue_numbers``, its residues are
    numbered 1..N (a ``ligand_ccd`` chain gets the single number 1).

    Args:
        structure: The structure definition to convert.
        name: Name for the mmCIF data block. Defaults to "pred".

    Returns:
        An mmCIF block containing the structure information.

    """
    block = gemmi.cif.Block(name)

    # `_entity.type` for the entity kinds that are not linear polymers. Branched
    # entities are reported as "branched" (the mmCIF dictionary value) so that
    # `_entity` agrees with the `_pdbx_entity_branch_list` rows written below.
    # Every other entity kind falls back to "polymer".
    special_entity_types = {"ligand_ccd": "non-polymer", "branched_polymer": "branched"}

    def residue_numbers(chain, residues):
        """Return the chain's explicit residue numbers, or 1..len(residues) if it has none."""
        if chain.residue_numbers is not None:
            return chain.residue_numbers
        return np.arange(1, len(residues) + 1)

    # Entities paired with their 1-based id, then split by kind.
    entities = list(enumerate(structure.entities, start=1))
    polymers = [
        (entity_id, definition)
        for entity_id, definition in entities
        if definition.type in BOLTZ_POLYMER_TO_MMCIF_POLYMER
    ]
    ligands = [(entity_id, definition) for entity_id, definition in entities if definition.type == "ligand_ccd"]
    branched = [
        (entity_id, definition) for entity_id, definition in entities if definition.type == "branched_polymer"
    ]

    # Every chain paired with the entity it is an instance of.
    chains = [
        (chain_id, chain, structure.entities[chain.entity_idx]) for chain_id, chain in structure.chains.items()
    ]

    block.set_mmcif_category(
        "_entity",
        pivot(
            [
                {
                    "id": str(entity_id),
                    "type": special_entity_types.get(definition.type, "polymer"),
                    "pdbx_description": definition.description,
                }
                for entity_id, definition in entities
            ]
        ),
    )

    block.set_mmcif_category(
        "_entity_poly",
        pivot(
            [
                {
                    "entity_id": str(entity_id),
                    "type": BOLTZ_POLYMER_TO_MMCIF_POLYMER[definition.type],
                }
                for entity_id, definition in polymers
            ]
        ),
    )

    block.set_mmcif_category(
        "_entity_poly_seq",
        pivot(
            [
                {"entity_id": str(entity_id), "num": str(res_idx + 1), "mon_id": residue_name, "hetero": "n"}
                for entity_id, definition in polymers
                for res_idx, residue_name in enumerate(
                    residue_names_from_sequence(definition.sequence, polymer_type=definition.type)
                )
            ]
        ),
    )

    block.set_mmcif_category(
        "_pdbx_poly_seq_scheme",
        pivot(
            [
                {
                    "asym_id": chain_id,
                    "entity_id": chain.entity_idx + 1,
                    "seq_id": residue_idx + 1,
                    "pdb_seq_num": residue_number,
                }
                for chain_id, chain, entity in chains
                if entity.type in BOLTZ_POLYMER_TO_MMCIF_POLYMER
                for residue_idx, residue_number in enumerate(residue_numbers(chain, entity.sequence))
            ]
        ),
    )

    block.set_mmcif_category(
        "_pdbx_entity_nonpoly",
        pivot([{"entity_id": str(entity_id), "comp_id": definition.comp_id} for entity_id, definition in ligands]),
    )

    block.set_mmcif_category(
        "_pdbx_nonpoly_scheme",
        pivot(
            [
                {
                    "asym_id": chain_id,
                    "entity_id": chain.entity_idx + 1,
                    "mon_id": entity.comp_id,
                    # A CCD ligand chain is a single residue: use its first number, or 1.
                    "pdb_seq_num": chain.residue_numbers[0] if chain.residue_numbers is not None else 1,
                }
                for chain_id, chain, entity in chains
                if entity.type == "ligand_ccd"
            ]
        ),
    )

    block.set_mmcif_category(
        "_pdbx_entity_branch_list",
        pivot(
            [
                {"entity_id": str(entity_id), "num": res_idx + 1, "comp_id": residue_name}
                for entity_id, definition in branched
                for res_idx, residue_name in enumerate(definition.comp_ids)
            ]
        ),
    )

    block.set_mmcif_category(
        "_pdbx_branch_scheme",
        pivot(
            [
                {"asym_id": chain_id, "entity_id": chain.entity_idx + 1, "num": res_idx + 1, "pdb_seq_num": resnum}
                for chain_id, chain, entity in chains
                if entity.type == "branched_polymer"
                for res_idx, resnum in enumerate(residue_numbers(chain, entity.comp_ids))
            ]
        ),
    )

    block.set_mmcif_category(
        "_struct_asym",
        pivot([{"id": chain_id, "entity_id": chain.entity_idx + 1} for chain_id, chain, _ in chains]),
    )

    return block