import gemmi
import numpy as np
from boltz_data.definition import StructureDefinition
from boltz_data.sequence._parse import residue_names_from_sequence
from boltz_data.utils import pivot
# Boltz polymer type name -> value written to the mmCIF `_entity_poly.type` item.
# The keys also serve as the set of entity types treated as sequence polymers.
BOLTZ_POLYMER_TO_MMCIF_POLYMER = {
    "protein": "polypeptide(L)",
    "rna": "polyribonucleotide",
    "dna": "polydeoxyribonucleotide",
}
def mmcif_from_structure(structure: StructureDefinition, /, *, name: str = "pred") -> gemmi.cif.Block:
    """
    Convert a StructureDefinition to an mmCIF block.

    Writes, in this order, the categories ``_entity``, ``_entity_poly``,
    ``_entity_poly_seq``, ``_pdbx_poly_seq_scheme``, ``_pdbx_entity_nonpoly``,
    ``_pdbx_nonpoly_scheme``, ``_pdbx_entity_branch_list``, ``_pdbx_branch_scheme``
    and ``_struct_asym``. Only these categories are set by this function.

    Entity ids are the 1-based positions of the definitions in
    ``structure.entities``; asym ids are the keys of ``structure.chains``.
    When a chain carries no explicit ``residue_numbers``, its residues are
    numbered 1..N (a lone CCD ligand gets number 1).

    Entity types are mapped to ``_entity.type`` as follows: ``"ligand_ccd"``
    becomes ``non-polymer``, ``"branched_polymer"`` becomes ``branched``, and
    every other type is written as ``polymer``.

    Args:
        structure: The structure definition to convert.
        name: Name for the mmCIF data block. Defaults to "pred".

    Returns:
        An mmCIF block containing the structure information.
    """
    block = gemmi.cif.Block(name)

    # `_entity.type` has a dedicated value for branched (oligosaccharide-like)
    # entities; anything not listed here keeps the historical "polymer" label.
    mmcif_entity_type = {"ligand_ccd": "non-polymer", "branched_polymer": "branched"}

    # Parse every polymer sequence exactly once. The parsed list is the single
    # source of truth for the residue count, so `_entity_poly_seq` and the
    # default numbering in `_pdbx_poly_seq_scheme` can never disagree.
    residue_names_by_entity = {
        entity_idx: list(residue_names_from_sequence(definition.sequence, polymer_type=definition.type))
        for entity_idx, definition in enumerate(structure.entities)
        if definition.type in BOLTZ_POLYMER_TO_MMCIF_POLYMER
    }

    # ---- Entity-level categories -------------------------------------------
    block.set_mmcif_category(
        "_entity",
        pivot(
            [
                {
                    "id": str(entity_idx + 1),
                    "type": mmcif_entity_type.get(definition.type, "polymer"),
                    "pdbx_description": definition.description,
                }
                for entity_idx, definition in enumerate(structure.entities)
            ]
        ),
    )
    block.set_mmcif_category(
        "_entity_poly",
        pivot(
            [
                {
                    "entity_id": str(entity_idx + 1),
                    "type": BOLTZ_POLYMER_TO_MMCIF_POLYMER[structure.entities[entity_idx].type],
                }
                for entity_idx in residue_names_by_entity
            ]
        ),
    )
    block.set_mmcif_category(
        "_entity_poly_seq",
        pivot(
            [
                {"entity_id": str(entity_idx + 1), "num": str(res_idx + 1), "mon_id": residue_name, "hetero": "n"}
                for entity_idx, residue_names in residue_names_by_entity.items()
                for res_idx, residue_name in enumerate(residue_names)
            ]
        ),
    )

    # ---- Chain-level rows ---------------------------------------------------
    # One pass over the chains fills all four per-chain tables; each table
    # keeps the iteration order of `structure.chains`.
    poly_scheme_rows = []
    nonpoly_scheme_rows = []
    branch_scheme_rows = []
    struct_asym_rows = []
    for chain_id, chain in structure.chains.items():
        entity = structure.entities[chain.entity_idx]
        entity_id = chain.entity_idx + 1
        residue_numbers = chain.residue_numbers
        struct_asym_rows.append({"id": chain_id, "entity_id": entity_id})

        if entity.type in BOLTZ_POLYMER_TO_MMCIF_POLYMER:
            if residue_numbers is None:
                # Default author numbering: 1..N over the parsed residues.
                residue_numbers = np.arange(1, len(residue_names_by_entity[chain.entity_idx]) + 1)
            poly_scheme_rows.extend(
                {
                    "asym_id": chain_id,
                    "entity_id": entity_id,
                    "seq_id": residue_idx + 1,
                    "pdb_seq_num": residue_number,
                }
                for residue_idx, residue_number in enumerate(residue_numbers)
            )
        elif entity.type == "ligand_ccd":
            # A CCD ligand is a single component, so only the first residue
            # number (or 1 when none is given) is relevant.
            nonpoly_scheme_rows.append(
                {
                    "asym_id": chain_id,
                    "entity_id": entity_id,
                    "mon_id": entity.comp_id,
                    "pdb_seq_num": residue_numbers[0] if residue_numbers is not None else 1,
                }
            )
        elif entity.type == "branched_polymer":
            if residue_numbers is None:
                residue_numbers = np.arange(1, len(entity.comp_ids) + 1)
            branch_scheme_rows.extend(
                {"asym_id": chain_id, "entity_id": entity_id, "num": res_idx + 1, "pdb_seq_num": resnum}
                for res_idx, resnum in enumerate(residue_numbers)
            )

    block.set_mmcif_category("_pdbx_poly_seq_scheme", pivot(poly_scheme_rows))
    block.set_mmcif_category(
        "_pdbx_entity_nonpoly",
        pivot(
            [
                {"entity_id": str(entity_idx + 1), "comp_id": definition.comp_id}
                for entity_idx, definition in enumerate(structure.entities)
                if definition.type == "ligand_ccd"
            ]
        ),
    )
    block.set_mmcif_category("_pdbx_nonpoly_scheme", pivot(nonpoly_scheme_rows))
    block.set_mmcif_category(
        "_pdbx_entity_branch_list",
        pivot(
            [
                {"entity_id": str(entity_idx + 1), "num": res_idx + 1, "comp_id": residue_name}
                for entity_idx, definition in enumerate(structure.entities)
                if definition.type == "branched_polymer"
                for res_idx, residue_name in enumerate(definition.comp_ids)
            ]
        ),
    )
    block.set_mmcif_category("_pdbx_branch_scheme", pivot(branch_scheme_rows))
    block.set_mmcif_category("_struct_asym", pivot(struct_asym_rows))
    return block