Source code for boltz_data.ccd._compress
"""Compression utilities for chemical components."""
import io
from ._constants import MAX_SINGLE_BYTE, SYMBOL_TO_ID
from ._models import ChemicalComponent
[docs]
def compress_chemical_component(chemical_component: ChemicalComponent, /) -> bytes:
    """
    Compress a chemical component to a compact binary representation.

    The output is laid out as follows (all integers are big-endian):

    1. ``comp_id``, ``type`` and ``name``, each as a null-terminated ASCII string.
    2. One flag byte: 1 if the atom count or the bond count exceeds
       ``MAX_SINGLE_BYTE``, otherwise 0. When the flag is 1, every count and
       atom index below is written with 2 bytes instead of 1.
    3. The atom count.
    4. One record per atom, in ``atoms`` iteration order: the null-terminated
       ASCII atom ID, one byte holding ``SYMBOL_TO_ID[element]``, and one
       signed byte holding the charge.
    5. The bond count.
    6. One record per bond, in ``bonds`` iteration order: the indices of the
       two bonded atoms (positions in the atom list, not atom IDs) followed
       by one byte holding the bond order.

    Args:
        chemical_component: The chemical component to compress.

    Returns:
        The compressed representation as bytes.

    Raises:
        ValueError: If a bond refers to an atom ID that is not among the
            component's atoms.
        KeyError: If an atom's element is not a key of ``SYMBOL_TO_ID``.
        OverflowError: If a count, index, charge or bond order does not fit
            in the number of bytes allotted to it.
        UnicodeEncodeError: If a text field contains non-ASCII characters.
    """
    # NOTE(review): an earlier revision of this docstring said leaving-atom
    # flags are packed as bits inside the atom records. Nothing of the sort is
    # written here -- confirm against the matching decompressor.
    out = io.BytesIO()

    # Component metadata: null-terminated ASCII strings.
    for text in (
        chemical_component.comp_id,
        chemical_component.type,
        chemical_component.name,
    ):
        out.write(text.encode("ascii"))
        out.write(b"\x00")

    # Decide once whether counts and atom indices need one byte or two.
    num_atoms = len(chemical_component.atoms)
    num_bonds = len(chemical_component.bonds)
    large_molecule = num_atoms > MAX_SINGLE_BYTE or num_bonds > MAX_SINGLE_BYTE
    index_width = 2 if large_molecule else 1

    out.write(b"\x01" if large_molecule else b"\x00")
    out.write(num_atoms.to_bytes(index_width, "big"))

    # Atom records. While writing them, remember each atom ID's position so
    # bonds can be encoded by index with an O(1) lookup instead of a linear
    # list scan per bond endpoint.
    index_by_atom_id: dict[str, int] = {}
    for position, atom in enumerate(chemical_component.atoms.values()):
        out.write(atom.atom_id.encode("ascii"))
        out.write(b"\x00")
        out.write(SYMBOL_TO_ID[atom.element].to_bytes(1, "big"))
        out.write(atom.charge.to_bytes(1, "big", signed=True))
        # setdefault keeps the first position if an ID were ever repeated,
        # which is what list.index() would have returned.
        index_by_atom_id.setdefault(atom.atom_id, position)

    # Bond records.
    out.write(num_bonds.to_bytes(index_width, "big"))
    for bond in chemical_component.bonds.values():
        for endpoint_id in (bond.atom_id_1, bond.atom_id_2):
            try:
                endpoint_index = index_by_atom_id[endpoint_id]
            except KeyError:
                # Keep the exception type callers saw from list.index().
                raise ValueError(
                    f"bond references unknown atom id {endpoint_id!r}"
                ) from None
            out.write(endpoint_index.to_bytes(index_width, "big"))
        out.write(bond.order.to_bytes(1, "big"))

    return out.getvalue()