# Source code for boltz_data.ccd._compress

"""Compression utilities for chemical components."""

import io

from ._constants import MAX_SINGLE_BYTE, SYMBOL_TO_ID
from ._models import ChemicalComponent


def compress_chemical_component(chemical_component: ChemicalComponent, /) -> bytes:
    """
    Compress a chemical component to a compact binary representation.

    The output is written in this order:

    - ``comp_id``, ``type`` and ``name``, each as a null-terminated ASCII string
    - One flag byte: 1 if the atom count or the bond count exceeds
      ``MAX_SINGLE_BYTE``, otherwise 0
    - The atom count (2 bytes big-endian when the flag is 1, otherwise 1 byte)
    - One record per atom: the null-terminated ASCII ``atom_id``, one byte
      holding ``SYMBOL_TO_ID[element]``, and one signed byte holding the charge
    - The bond count (same width as the atom count)
    - One record per bond: the positions of its two atoms in the atom list
      (same width as the counts) followed by one byte holding the bond order

    Bonds refer to atoms by position rather than by atom ID to save space.

    Args:
        chemical_component: The chemical component to compress.

    Returns:
        The compressed representation as bytes.

    Raises:
        ValueError: If a bond references an atom ID that is not among the
            component's atoms, or a text field is not ASCII-encodable
            (``UnicodeEncodeError`` is a ``ValueError`` subclass).
        OverflowError: If a count, index, charge or bond order does not fit
            in the number of bytes allotted to it.
    """
    out = io.BytesIO()

    def write_cstring(text: str) -> None:
        # Text fields are stored as ASCII followed by a single NUL terminator.
        out.write(text.encode("ascii"))
        out.write(b"\x00")

    # Component metadata.
    write_cstring(chemical_component.comp_id)
    write_cstring(chemical_component.type)
    write_cstring(chemical_component.name)

    # Counts and indices share one width, chosen once for the whole component.
    num_atoms = len(chemical_component.atoms)
    num_bonds = len(chemical_component.bonds)
    large_molecule = num_atoms > MAX_SINGLE_BYTE or num_bonds > MAX_SINGLE_BYTE
    width = 2 if large_molecule else 1

    out.write((1 if large_molecule else 0).to_bytes(1, "big"))
    out.write(num_atoms.to_bytes(width, "big"))

    # Atom records. While writing them, remember each atom's position so bonds
    # can be resolved with a constant-time lookup instead of a list scan.
    # NOTE(review): no leaving-atom flag is written here, although an earlier
    # version of the docstring mentioned one - confirm against the decompressor.
    atom_index: dict[str, int] = {}
    for position, atom in enumerate(chemical_component.atoms.values()):
        write_cstring(atom.atom_id)
        out.write(SYMBOL_TO_ID[atom.element].to_bytes(1, "big"))
        out.write(atom.charge.to_bytes(1, "big", signed=True))
        # setdefault keeps the first position for a repeated ID, matching the
        # behaviour of list.index().
        atom_index.setdefault(atom.atom_id, position)

    # Bond records.
    out.write(num_bonds.to_bytes(width, "big"))
    for bond in chemical_component.bonds.values():
        try:
            first = atom_index[bond.atom_id_1]
            second = atom_index[bond.atom_id_2]
        except KeyError as err:
            # Keep raising ValueError, as list.index() did, for unknown atoms.
            raise ValueError(f"{err.args[0]!r} is not a known atom ID") from None
        out.write(first.to_bytes(width, "big"))
        out.write(second.to_bytes(width, "big"))
        out.write(bond.order.to_bytes(1, "big"))

    return out.getvalue()