Source code for boltz_data.mol._from._from_mmcif

"""Functions for creating BZMol objects from mmCIF files."""

from collections.abc import Mapping
from functools import lru_cache

import gemmi
import numpy as np

from boltz_data.ccd import ChemicalComponent
from boltz_data.cif import get_structure_from_mmcif
from boltz_data.cif._utils import clean_string
from boltz_data.mol._mol import BZBioMol

from ._atom_mapping import create_atom_mapping
from ._constants import ALT_LOC_A, ALT_LOC_DEFAULT, WATER_RESIDUE
from ._from_definition import bzmol_from_structure


# Cache cleaned strings to avoid repeated string operations
@lru_cache(maxsize=10000)
def _clean_string_cached(s: str) -> str:
    """
    Return ``clean_string(s)``, memoising the result per distinct input.

    Up to 10000 results are retained by the LRU cache, so values that recur
    many times (such as atom names) are only cleaned once.
    """
    cleaned = clean_string(s)
    return cleaned


def bzmol_from_mmcif(
    mmcif: gemmi.cif.Block,
    *,
    chemical_component_dictionary: Mapping[str, ChemicalComponent] | None = None,
) -> BZBioMol:
    """
    Build a BZBioMol with coordinates from a single mmCIF block.

    The conversion runs in two stages:
    1. The block is converted to a structure description, from which a
       coordinate-free molecule is built.
    2. The atom records of the same block are mapped onto that molecule.

    Args:
        mmcif: The mmCIF block containing the structure data.
        chemical_component_dictionary: Optional mapping from component IDs to
            ChemicalComponent objects; forwarded unchanged to the molecule builder.

    Returns:
        The molecule with per-atom coordinates and a mask marking which atoms
        received coordinates from the mmCIF block.
    """
    coordinate_free_mol = bzmol_from_structure(
        get_structure_from_mmcif(mmcif),
        chemical_component_dictionary=chemical_component_dictionary,
    )

    # Second stage: fill in coordinates from the block's atom records.
    return _map_coordinates_to_bzmol(mmcif, coordinate_free_mol)
def _map_coordinates_to_bzmol(  # noqa: C901
    mmcif: gemmi.cif.Block,
    bzmol: BZBioMol,
    /,
) -> BZBioMol:
    """
    Map atom coordinates from mmCIF to the BZBioMol structure.

    Rows of ``_atom_site`` are filtered first: water residues, models other
    than model 1, hydrogens and alternative locations other than
    ``ALT_LOC_DEFAULT`` / ``ALT_LOC_A`` are skipped. Every remaining row must
    map to exactly one atom of ``bzmol``.

    Args:
        mmcif: The mmCIF block containing atom coordinates.
        bzmol: The BZBioMol structure without coordinates.

    Returns:
        A new BZBioMol built from ``bzmol.model_dump()`` with
        ``atom_coordinates``, ``atom_resolved`` and ``atom_b_factor`` replaced.
        Atoms that have no retained row keep zero coordinates, a zero B-factor
        and ``atom_resolved`` set to False.

    Raises:
        ValueError: If a retained row refers to an atom that is not present in
            ``bzmol``, or if two retained rows map to the same atom.
    """
    # Initialize coordinates, B-factors and the resolved mask
    atom_coordinates = np.zeros((bzmol.num_atoms, 3), dtype=np.float32)
    atom_b_factor = np.zeros(bzmol.num_atoms, dtype=np.float32)
    atom_resolved = np.zeros(bzmol.num_atoms, dtype=bool)

    # Mapping from (chain_id, seq_id, residue_name, atom_name) to BZMol atom index
    atom_mapping = create_atom_mapping(bzmol)

    # Mapping for branched polymers which don't have sequence numbers
    chain_id_and_residue_number_to_seq_num: dict[tuple[str, int], int] = {}
    for asym_id, seq_num, residue_number in mmcif.find("_pdbx_branch_scheme.", ["asym_id", "num", "pdb_seq_num"]):
        chain_id_and_residue_number_to_seq_num[(asym_id, int(residue_number))] = int(seq_num)

    # Read atom coordinates from mmCIF
    atom_site_columns = [
        "label_asym_id",  # chain ID
        "label_seq_id",  # residue sequence number
        "label_comp_id",  # residue name
        "label_atom_id",  # atom name
        "label_alt_id",  # alternative location indicator
        "Cartn_x",  # x coordinate
        "Cartn_y",  # y coordinate
        "Cartn_z",  # z coordinate
        "type_symbol",  # element symbol
        "pdbx_PDB_model_num",  # model number
        "auth_seq_id",  # author residue sequence number
        "B_iso_or_equiv",  # B-factor
    ]

    # Collect valid atoms for batch processing
    valid_atoms = []
    coords_batch = []
    bfactors_batch = []
    # The resolved mask is only written after the loop (batch assignment), so
    # it cannot be used to detect duplicates while iterating. Track the indices
    # that already received a row in a set instead.
    assigned_indices = set()

    for row in mmcif.find("_atom_site.", atom_site_columns):
        residue_name = row[2]

        # Early filtering for performance
        if residue_name == WATER_RESIDUE:
            continue  # Skip water molecules

        model_num = int(row[9]) if row[9] != "." else 1
        # Skip atoms from models other than model 1 (for NMR structures)
        if model_num != 1:
            continue

        element = row[8]
        # Skip hydrogen atoms
        if element == "H":
            continue

        alt_loc = row[4]
        # Skip atoms with alternative locations other than '.' or 'A'
        if alt_loc not in (ALT_LOC_DEFAULT, ALT_LOC_A):
            continue

        seq_id = row[1]
        chain_id = row[0]
        auth_seq_num = row[10]

        if seq_id == ".":
            # No label sequence number: fall back to the branch scheme lookup
            # (keyed by author residue number), or to 1 when there is no entry.
            branch_key = (chain_id, int(auth_seq_num))
            seq_id_int = chain_id_and_residue_number_to_seq_num.get(branch_key, 1)
        else:
            seq_id_int = int(seq_id)

        atom_name = row[3]
        # Use cached string cleaning for performance
        cleaned_atom_name = _clean_string_cached(atom_name)

        # Find the corresponding atom in the BZMol
        key = (chain_id, seq_id_int, residue_name, cleaned_atom_name)
        try:
            atom_idx = atom_mapping[key]
        except KeyError:
            msg = (
                f"Found coordinate for unknown atom: "
                f"chain={chain_id}, seq={seq_id_int}, "
                f"residue={residue_name}, atom={atom_name}"
            )
            raise ValueError(msg) from None

        # Check if this atom already has coordinates assigned
        if atom_idx in assigned_indices:
            msg = (
                f"Duplicate coordinate assignment for atom: "
                f"chain={chain_id}, seq={seq_id_int}, residue={residue_name}, atom={atom_name} "
                f"(BZMol atom index {atom_idx})"
            )
            raise ValueError(msg)
        assigned_indices.add(atom_idx)

        # Collect for batch processing
        valid_atoms.append(atom_idx)
        coords_batch.append([float(row[5]), float(row[6]), float(row[7])])
        bfactors_batch.append(float(row[11]) if row[11] != "." else 0.0)

    # Batch assign coordinates using NumPy for better performance
    if valid_atoms:
        valid_indices = np.array(valid_atoms)
        atom_coordinates[valid_indices] = np.array(coords_batch, dtype=np.float32)
        atom_b_factor[valid_indices] = np.array(bfactors_batch, dtype=np.float32)
        atom_resolved[valid_indices] = True

    return BZBioMol(  # type: ignore[missing-argument]
        **{
            **bzmol.model_dump(),
            "atom_coordinates": atom_coordinates,
            "atom_resolved": atom_resolved,
            "atom_b_factor": atom_b_factor,
        }
    )