# Source code for boltz_data.mol._from._from_mmcif

"""Functions for creating BZMol objects from mmCIF files."""

from collections.abc import Mapping
from functools import lru_cache

import gemmi
import numpy as np

from boltz_data.ccd import ChemicalComponent
from boltz_data.cif import get_structure_from_mmcif
from boltz_data.cif._utils import clean_string
from boltz_data.mol._mol import BZBioMol

from ._atom_mapping import create_atom_mapping
from ._constants import ALT_LOC_A, ALT_LOC_DEFAULT, WATER_RESIDUE
from ._from_definition import bzmol_from_structure


# Memoize cleaned strings so a repeated input is only passed to clean_string once
@lru_cache(maxsize=10000)
def _clean_string_cached(s: str) -> str:
    """
    Return ``clean_string(s)``, memoized per distinct input string.

    Results are kept in an LRU cache holding at most 10000 entries.

    Args:
        s: The raw string to clean (used for atom names read from mmCIF rows).

    Returns:
        The result of ``clean_string(s)``.

    """
    cleaned = clean_string(s)
    return cleaned



# [docs]
def bzmol_from_mmcif(
    mmcif: gemmi.cif.Block,
    *,
    chemical_component_dictionary: Mapping[str, ChemicalComponent] | None = None,
) -> BZBioMol:
    """
    Build a BZBioMol, including atom coordinates, from an mmCIF block.

    The work is done in three chained steps:
    1. ``get_structure_from_mmcif`` reads the structure definition from the block.
    2. ``bzmol_from_structure`` turns that definition into a BZBioMol (no coordinates yet).
    3. ``_map_coordinates_to_bzmol`` fills in coordinates, B-factors and the
       resolved-atom mask from the block's ``_atom_site`` records.

    Args:
        mmcif: The mmCIF block containing structure data.
        chemical_component_dictionary: Optional mapping of component IDs to
            ChemicalComponent objects; forwarded unchanged to ``bzmol_from_structure``.

    Returns:
        A BZBioMol whose atoms carry coordinates, together with a mask marking
        which atoms received coordinates from the mmCIF.

    """
    # Arguments are evaluated innermost-first, so the order is:
    # parse structure -> build coordinate-free molecule -> attach coordinates.
    return _map_coordinates_to_bzmol(
        mmcif,
        bzmol_from_structure(
            get_structure_from_mmcif(mmcif),
            chemical_component_dictionary=chemical_component_dictionary,
        ),
    )



def _map_coordinates_to_bzmol(  # noqa: C901
    mmcif: gemmi.cif.Block,
    bzmol: BZBioMol,
    /,
) -> BZBioMol:
    """
    Map atom coordinates from mmCIF to the BZBioMol structure.

    Every ``_atom_site`` row is considered, except rows that are:
    - water residues (``WATER_RESIDUE``),
    - from a model other than model 1 (a model number of ``"."`` counts as 1),
    - hydrogen atoms (``type_symbol == "H"``),
    - alternative locations other than ``ALT_LOC_DEFAULT`` or ``ALT_LOC_A``.

    Each remaining row is matched to a BZBioMol atom through the key
    ``(chain_id, seq_num, residue_name, cleaned_atom_name)``. When the row has no
    ``label_seq_id`` (``"."``), the sequence number is looked up in
    ``_pdbx_branch_scheme`` by ``(asym_id, pdb_seq_num)`` and defaults to 1 when
    no entry exists.

    Args:
        mmcif: The mmCIF block containing atom coordinates.
        bzmol: The BZBioMol structure without coordinates.

    Returns:
        A new BZBioMol with coordinates, B-factors and a coordinate mask
        (``atom_resolved``) that is True for atoms that received coordinates.

    Raises:
        ValueError: If a row that passes the filters does not correspond to any
            atom in ``bzmol``, or if two such rows map to the same atom.

    """
    # Initialize coordinates, B-factors and mask (all-zero / all-False)
    num_atoms = bzmol.num_atoms
    atom_coordinates = np.zeros((num_atoms, 3), dtype=np.float32)
    atom_b_factor = np.zeros(num_atoms, dtype=np.float32)
    atom_resolved = np.zeros(num_atoms, dtype=bool)

    # Mapping from (chain_id, seq_num, residue_name, atom_name) to BZMol atom index
    atom_mapping = create_atom_mapping(bzmol)

    # Mapping for branched polymers which don't have sequence numbers:
    # (asym_id, pdb_seq_num) -> num
    chain_id_and_residue_number_to_seq_num: dict[tuple[str, int], int] = {}
    for asym_id, seq_num, residue_number in mmcif.find("_pdbx_branch_scheme.", ["asym_id", "num", "pdb_seq_num"]):
        chain_id_and_residue_number_to_seq_num[(asym_id, int(residue_number))] = int(seq_num)

    # Columns read from _atom_site; the row[...] indices below follow this order
    atom_site_columns = [
        "label_asym_id",  # 0: chain ID
        "label_seq_id",  # 1: residue sequence number
        "label_comp_id",  # 2: residue name
        "label_atom_id",  # 3: atom name
        "label_alt_id",  # 4: alternative location indicator
        "Cartn_x",  # 5: x coordinate
        "Cartn_y",  # 6: y coordinate
        "Cartn_z",  # 7: z coordinate
        "type_symbol",  # 8: element symbol
        "pdbx_PDB_model_num",  # 9: model number
        "auth_seq_id",  # 10: author residue sequence number
        "B_iso_or_equiv",  # 11: B-factor
    ]

    # Collected per-atom values, written into the arrays in one step after the loop
    valid_atoms = []
    coords_batch = []
    bfactors_batch = []

    for row in mmcif.find("_atom_site.", atom_site_columns):
        residue_name = row[2]

        # Cheap string filters first, before any numeric parsing
        if residue_name == WATER_RESIDUE:
            continue  # Skip water molecules

        # Skip atoms from models other than model 1 (for NMR structures)
        model_num = int(row[9]) if row[9] != "." else 1
        if model_num != 1:
            continue

        # Skip hydrogen atoms
        if row[8] == "H":
            continue

        # Skip atoms with alternative locations other than the default or 'A'
        if row[4] not in (ALT_LOC_DEFAULT, ALT_LOC_A):
            continue

        chain_id = row[0]
        seq_id = row[1]

        if seq_id == ".":
            # No label_seq_id: use the branch-scheme number when one exists for this
            # (chain, author residue number), otherwise fall back to 1.
            seq_id_int = chain_id_and_residue_number_to_seq_num.get((chain_id, int(row[10])), 1)
        else:
            seq_id_int = int(seq_id)

        atom_name = row[3]

        # Use cached string cleaning for performance
        cleaned_atom_name = _clean_string_cached(atom_name)

        # Find the corresponding atom in the BZMol
        key = (chain_id, seq_id_int, residue_name, cleaned_atom_name)
        try:
            atom_idx = atom_mapping[key]
        except KeyError:
            msg = (
                f"Found coordinate for unknown atom: "
                f"chain={chain_id}, seq={seq_id_int}, "
                f"residue={residue_name}, atom={atom_name}"
            )
            raise ValueError(msg) from None

        # Check if this atom already has coordinates assigned
        if atom_resolved[atom_idx]:
            msg = (
                f"Duplicate coordinate assignment for atom: "
                f"chain={chain_id}, seq={seq_id_int}, residue={residue_name}, atom={atom_name} "
                f"(BZMol atom index {atom_idx})"
            )
            raise ValueError(msg)

        # Mark the atom right away: the duplicate check above relies on the mask
        # reflecting rows already seen in this loop. Deferring this write to the
        # batch step below would leave the check permanently False.
        atom_resolved[atom_idx] = True

        # Collect for batch processing
        valid_atoms.append(atom_idx)
        coords_batch.append([float(row[5]), float(row[6]), float(row[7])])
        bfactors_batch.append(float(row[11]) if row[11] != "." else 0.0)

    # Batch assign coordinates and B-factors using NumPy for better performance
    if valid_atoms:
        valid_indices = np.array(valid_atoms)
        atom_coordinates[valid_indices] = np.array(coords_batch, dtype=np.float32)
        atom_b_factor[valid_indices] = np.array(bfactors_batch, dtype=np.float32)

    # Rebuild the model from its dumped fields, overriding the three per-atom arrays
    return BZBioMol(  # type: ignore[missing-argument]
        **{
            **bzmol.model_dump(),
            "atom_coordinates": atom_coordinates,
            "atom_resolved": atom_resolved,
            "atom_b_factor": atom_b_factor,
        }
    )