# Source code for boltz_data.ccd._decompress
"""Decompression utilities for chemical components."""
import io
from ._constants import SYMBOL_TO_ID
from ._models import ChemicalComponent, ChemicalComponentAtom, ChemicalComponentBond
def read_null_terminated_string(data: io.BytesIO) -> str:
    """Consume one NUL-terminated ASCII string from the stream.

    Bytes are read one at a time starting at the stream's current position.
    Reading stops after the first ``0x00`` byte (which is consumed but not
    included in the result) or when the stream is exhausted, whichever comes
    first. Reaching the end of the stream without a terminator is not treated
    as an error: whatever was collected so far is returned.

    Args:
        data: Stream positioned at the first byte of the string.

    Returns:
        The decoded text, without the terminator. Empty if the stream is
        already exhausted or the next byte is the terminator.

    Raises:
        UnicodeDecodeError: If any collected byte is not valid ASCII.
    """
    collected = bytearray()
    # An empty read means end-of-stream; a NUL byte is the terminator.
    while (byte := data.read(1)) and byte != b"\x00":
        collected += byte
    return bytes(collected).decode("ascii")
# [docs]
def _read_fixed_int(data: io.BytesIO, size: int, *, signed: bool = False) -> int:
    """Read a ``size``-byte big-endian integer, rejecting truncated input.

    ``int.from_bytes`` happily decodes short or empty byte strings (an empty
    read decodes as 0), so without this length check a cut-off stream would be
    silently misread instead of reported.

    Raises:
        ValueError: If fewer than ``size`` bytes remain in the stream.
    """
    raw = data.read(size)
    if len(raw) != size:
        raise ValueError(
            f"Truncated chemical component data: expected {size} byte(s) at "
            f"offset {data.tell() - len(raw)}, got {len(raw)}"
        )
    return int.from_bytes(raw, "big", signed=signed)


def decompress_chemical_component(compressed: bytes, /) -> ChemicalComponent:
    """
    Decompress a chemical component from its compressed binary representation.

    The compression format is a custom binary encoding that minimizes size by:

    - Using null-terminated ASCII strings for text fields
    - Storing element symbols as single-byte periodic table indices
    - Using 1-byte or 2-byte integers for indices based on molecule size
    - Encoding bonds by atom indices rather than atom IDs to save space

    All multi-byte integers are big-endian.

    Binary format structure:

    1. Component metadata:
        * Null-terminated ASCII string: comp_id
        * Null-terminated ASCII string: type
        * Null-terminated ASCII string: name
    2. Size indicator:
        * 1 byte: large_molecule flag (1 if num_atoms > 255 or num_bonds > 255)
        * 1 or 2 bytes: number of atoms (based on large_molecule flag)
    3. Atom records (repeated for each atom):
        * Null-terminated ASCII string: atom_id
        * 1 byte: element as periodic table index (1-120)
        * 1 byte: charge as a signed integer
    4. Bond records:
        * 1 or 2 bytes: number of bonds (based on large_molecule flag)
        * For each bond:
            * 1 or 2 bytes: index of first atom in atom list
            * 1 or 2 bytes: index of second atom in atom list
            * 1 byte: bond order (1=single, 2=double, 3=triple)

    Args:
        compressed: The compressed chemical component as bytes.

    Returns:
        The decompressed ChemicalComponent object.

    Raises:
        ValueError: If the data ends in the middle of a fixed-width field
            (flag, count, element, charge, bond index or bond order).
        KeyError: If an element index has no entry in ``SYMBOL_TO_ID``.
        IndexError: If a bond refers to an atom index beyond the atom list.
        UnicodeDecodeError: If a text field contains non-ASCII bytes.
    """
    data = io.BytesIO(compressed)

    # Read component metadata using null-terminated strings
    comp_id = read_null_terminated_string(data)
    type_ = read_null_terminated_string(data)
    name = read_null_terminated_string(data)

    # Read size information; the flag selects the width of every count and
    # atom index that follows.
    large_molecule = _read_fixed_int(data, 1) == 1
    int_size = 2 if large_molecule else 1
    num_atoms = _read_fixed_int(data, int_size)

    # Create reverse mapping from ID to symbol
    id_to_symbol = {v: k for k, v in SYMBOL_TO_ID.items()}

    # Read atoms. ``atom_ids`` keeps stream order so bond records, which store
    # positions rather than names, can be resolved back to atom IDs.
    atoms: dict[str, ChemicalComponentAtom] = {}
    atom_ids: list[str] = []
    for _ in range(num_atoms):
        atom_id = read_null_terminated_string(data)
        element_id = _read_fixed_int(data, 1)
        charge = _read_fixed_int(data, 1, signed=True)
        element = id_to_symbol[element_id]
        atoms[atom_id] = ChemicalComponentAtom(atom_id=atom_id, element=element, charge=charge)
        atom_ids.append(atom_id)

    # Read bonds
    num_bonds = _read_fixed_int(data, int_size)
    bonds: dict[tuple[str, str], ChemicalComponentBond] = {}
    for _ in range(num_bonds):
        atom1_idx = _read_fixed_int(data, int_size)
        atom2_idx = _read_fixed_int(data, int_size)
        order = _read_fixed_int(data, 1)
        atom_id_1 = atom_ids[atom1_idx]
        atom_id_2 = atom_ids[atom2_idx]
        bonds[(atom_id_1, atom_id_2)] = ChemicalComponentBond(
            atom_id_1=atom_id_1,
            atom_id_2=atom_id_2,
            order=order,
        )

    return ChemicalComponent(
        comp_id=comp_id,
        type=type_,
        name=name,
        atoms=atoms,
        bonds=bonds,
    )