# Source code for boltz_data.ccd._decompress

"""Decompression utilities for chemical components."""

import io

from ._constants import SYMBOL_TO_ID
from ._models import ChemicalComponent, ChemicalComponentAtom, ChemicalComponentBond


def read_null_terminated_string(data: io.BytesIO) -> str:
    """
    Read an ASCII string terminated by a NUL byte from the data stream.

    Bytes are consumed one at a time starting at the stream's current
    position. The NUL terminator is consumed but is not part of the result.
    If the stream runs out before a terminator is seen, the bytes read so far
    are returned as the string.

    Args:
        data: The stream to read from.

    Returns:
        The decoded string, without the terminator.

    """
    collected = bytearray()
    # The two-argument form of iter() stops once read() returns b"" (end of stream).
    for byte in iter(lambda: data.read(1), b""):
        if byte == b"\x00":
            break
        collected += byte
    return collected.decode("ascii")



# [docs]
def decompress_chemical_component(compressed: bytes, /) -> ChemicalComponent:
    """
    Decompress a chemical component from its compressed binary representation.

    The compression format is a custom binary encoding that minimizes size by:
    - Using null-terminated ASCII strings for text fields
    - Storing element symbols as single-byte periodic table indices
    - Using 1-byte or 2-byte integers for indices based on molecule size
    - Encoding bonds by atom indices rather than atom IDs to save space

    All multi-byte integers are big-endian.

    Binary format structure:

    1. Component metadata:

       * Null-terminated ASCII string: comp_id
       * Null-terminated ASCII string: type
       * Null-terminated ASCII string: name

    2. Size indicator:

       * 1 byte: large_molecule flag (1 if num_atoms > 255 or num_bonds > 255)
       * 1 or 2 bytes: number of atoms (based on large_molecule flag)

    3. Atom records (repeated for each atom):

       * Null-terminated ASCII string: atom_id
       * 1 byte: element as periodic table index (1-120)
       * 1 byte: atom charge as a signed integer

    4. Bond records:

       * 1 or 2 bytes: number of bonds (based on large_molecule flag)
       * For each bond:

         * 1 or 2 bytes: index of first atom in atom list
         * 1 or 2 bytes: index of second atom in atom list
         * 1 byte: bond order (1=single, 2=double, 3=triple)

    Args:
        compressed: The compressed chemical component as bytes.

    Returns:
        The decompressed ChemicalComponent object.

    Raises:
        ValueError: If the data ends before a fixed-width field has been read in full.
        KeyError: If an atom's element index has no entry in the symbol table.
        IndexError: If a bond refers to an atom index beyond the atom list.

    """
    data = io.BytesIO(compressed)

    # Read component metadata using null-terminated strings
    comp_id = read_null_terminated_string(data)
    type_ = read_null_terminated_string(data)
    name = read_null_terminated_string(data)

    # Read size information; the flag selects the width of every count and atom index below
    large_molecule = _read_int(data, 1, "large_molecule flag") == 1
    int_size = 2 if large_molecule else 1
    num_atoms = _read_int(data, int_size, "atom count")

    # Create reverse mapping from ID to symbol
    id_to_symbol = {v: k for k, v in SYMBOL_TO_ID.items()}

    # Read atoms; atom_ids keeps file order so bonds can refer to atoms by position
    atoms: dict[str, ChemicalComponentAtom] = {}
    atom_ids: list[str] = []

    for _ in range(num_atoms):
        atom_id = read_null_terminated_string(data)
        element_id = _read_int(data, 1, "element index")
        charge = _read_int(data, 1, "atom charge", signed=True)

        element = id_to_symbol[element_id]
        atoms[atom_id] = ChemicalComponentAtom(atom_id=atom_id, element=element, charge=charge)
        atom_ids.append(atom_id)

    # Read bonds
    num_bonds = _read_int(data, int_size, "bond count")
    bonds: dict[tuple[str, str], ChemicalComponentBond] = {}

    for _ in range(num_bonds):
        atom1_idx = _read_int(data, int_size, "first bond atom index")
        atom2_idx = _read_int(data, int_size, "second bond atom index")
        order = _read_int(data, 1, "bond order")

        atom_id_1 = atom_ids[atom1_idx]
        atom_id_2 = atom_ids[atom2_idx]
        bonds[(atom_id_1, atom_id_2)] = ChemicalComponentBond(
            atom_id_1=atom_id_1,
            atom_id_2=atom_id_2,
            order=order,
        )

    return ChemicalComponent(
        comp_id=comp_id,
        type=type_,
        name=name,
        atoms=atoms,
        bonds=bonds,
    )


def _read_int(data: io.BytesIO, size: int, field: str, *, signed: bool = False) -> int:
    """
    Read exactly ``size`` bytes from the stream as a big-endian integer.

    A short read is rejected rather than decoded: ``int.from_bytes`` would
    otherwise turn an empty read into 0 and a partial read into a wrong value.

    Args:
        data: The stream to read from.
        size: Number of bytes the field occupies.
        field: Human-readable field name used in the error message.
        signed: Whether to interpret the bytes as a two's complement integer.

    Returns:
        The decoded integer.

    Raises:
        ValueError: If fewer than ``size`` bytes remain in the stream.

    """
    raw = data.read(size)
    if len(raw) != size:
        msg = f"Truncated chemical component data: expected {size} byte(s) for {field}, got {len(raw)}"
        raise ValueError(msg)
    return int.from_bytes(raw, "big", signed=signed)