"""Functions for creating BZMol objects from mmCIF files."""
from collections.abc import Mapping
from functools import lru_cache
import gemmi
import numpy as np
from boltz_data.ccd import ChemicalComponent
from boltz_data.cif import get_structure_from_mmcif
from boltz_data.cif._utils import clean_string
from boltz_data.mol._mol import BZBioMol
from ._atom_mapping import create_atom_mapping
from ._constants import ALT_LOC_A, ALT_LOC_DEFAULT, WATER_RESIDUE
from ._from_definition import bzmol_from_structure
# Cache cleaned strings to avoid repeated string operations
@lru_cache(maxsize=10000)
def _clean_string_cached(s: str) -> str:
"""Cache clean_string for frequently used atom names."""
return clean_string(s)
[docs]
def bzmol_from_mmcif(
mmcif: gemmi.cif.Block,
*,
chemical_component_dictionary: Mapping[str, ChemicalComponent] | None = None,
) -> BZBioMol:
"""
Create a BZMol from an mmCIF file with coordinates.
This function:
1. Parses entity definitions from the mmCIF
2. Creates BZMols for each entity instance
3. Concatenates them into a single structure
4. Maps atom coordinates from the mmCIF to the BZMol
Args:
mmcif: The mmCIF block containing structure data.
chemical_component_dictionary: Dictionary mapping component IDs to ChemicalComponent objects.
Returns:
A BZMol containing all atoms with their coordinates and a mask indicating
which atoms have valid coordinates.
"""
structure = get_structure_from_mmcif(mmcif)
bzmol = bzmol_from_structure(structure, chemical_component_dictionary=chemical_component_dictionary)
# Map coordinates from mmCIF to the combined BZMol
return _map_coordinates_to_bzmol(mmcif, bzmol)
def _map_coordinates_to_bzmol( # noqa: C901
mmcif: gemmi.cif.Block,
bzmol: BZBioMol,
/,
) -> BZBioMol:
"""
Map atom coordinates from mmCIF to the BZBioMol structure.
Args:
mmcif: The mmCIF block containing atom coordinates.
bzmol: The BZBioMol structure without coordinates.
Returns:
A new BZBioMol with coordinates and coordinate mask.
"""
# Initialize coordinates and mask
atom_coordinates = np.zeros((bzmol.num_atoms, 3), dtype=np.float32)
atom_b_factor = np.zeros(bzmol.num_atoms, dtype=np.float32)
atom_resolved = np.zeros(bzmol.num_atoms, dtype=bool)
# Create a mapping from (chain_id, residue_name, atom_name) to BZMol atom index
atom_mapping = create_atom_mapping(bzmol)
# Mapping for branched polymers which don't have sequence numbers
chain_id_and_residue_number_to_seq_num: dict[tuple[str, int], int] = {}
for asym_id, seq_num, residue_number in mmcif.find("_pdbx_branch_scheme.", ["asym_id", "num", "pdb_seq_num"]):
chain_id_and_residue_number_to_seq_num[(asym_id, int(residue_number))] = int(seq_num)
# Read atom coordinates from mmCIF
atom_site_columns = [
"label_asym_id", # chain ID
"label_seq_id", # residue sequence number
"label_comp_id", # residue name
"label_atom_id", # atom name
"label_alt_id", # alternative location indicator
"Cartn_x", # x coordinate
"Cartn_y", # y coordinate
"Cartn_z", # z coordinate
"type_symbol", # element symbol
"pdbx_PDB_model_num", # model number
"auth_seq_id", # author residue sequence number
"B_iso_or_equiv", # B-factor
]
# Collect valid atoms for batch processing
valid_atoms = []
coords_batch = []
bfactors_batch = []
for row in mmcif.find("_atom_site.", atom_site_columns):
residue_name = row[2]
# Early filtering for performance
if residue_name == WATER_RESIDUE:
continue # Skip water molecules
model_num = int(row[9]) if row[9] != "." else 1
# Skip atoms from models other than model 1 (for NMR structures)
if model_num != 1:
continue
element = row[8]
# Skip hydrogen atoms
if element == "H":
continue
alt_loc = row[4]
# Skip atoms with alternative locations other than '.' or 'A'
if alt_loc not in (ALT_LOC_DEFAULT, ALT_LOC_A):
continue
seq_id = row[1]
chain_id = row[0]
auth_seq_num = row[10]
if seq_id == "." and (chain_id, int(auth_seq_num)) in chain_id_and_residue_number_to_seq_num:
seq_id_int = chain_id_and_residue_number_to_seq_num[(chain_id, int(auth_seq_num))]
else:
seq_id_int = int(seq_id) if seq_id != "." else 1 # Handle missing seq_id
atom_name = row[3]
# Use cached string cleaning for performance
cleaned_atom_name = _clean_string_cached(atom_name)
# Find the corresponding atom in the BZMol
key = (chain_id, seq_id_int, residue_name, cleaned_atom_name)
try:
atom_idx = atom_mapping[key]
except KeyError:
msg = (
f"Found coordinate for unknown atom: "
f"chain={chain_id}, seq={seq_id_int}, "
f"residue={residue_name}, atom={atom_name}"
)
raise ValueError(msg) from None
# Check if this atom already has coordinates assigned
if atom_resolved[atom_idx]:
msg = (
f"Duplicate coordinate assignment for atom: "
f"chain={chain_id}, seq={seq_id_int}, residue={residue_name}, atom={atom_name} "
f"(BZMol atom index {atom_idx})"
)
raise ValueError(msg)
# Collect for batch processing
valid_atoms.append(atom_idx)
coords_batch.append([float(row[5]), float(row[6]), float(row[7])])
bfactors_batch.append(float(row[11]) if row[11] != "." else 0.0)
# Batch assign coordinates using NumPy for better performance
if valid_atoms:
valid_indices = np.array(valid_atoms)
atom_coordinates[valid_indices] = np.array(coords_batch, dtype=np.float32)
atom_b_factor[valid_indices] = np.array(bfactors_batch, dtype=np.float32)
atom_resolved[valid_indices] = True
return BZBioMol( # type: ignore[missing-argument]
**{
**bzmol.model_dump(),
"atom_coordinates": atom_coordinates,
"atom_resolved": atom_resolved,
"atom_b_factor": atom_b_factor,
}
)