Source code for boltz_data.sequence._cluster

import subprocess
import tempfile
from collections.abc import Collection
from pathlib import Path
from typing import Literal

import polars as pl


[docs] def cluster_sequences( *, sequences: Collection[str], min_seq_id: float = 0.4, polymer_type: Literal["protein", "rna", "dna"] | None = None ) -> list[int]: """ Cluster sequences using MMseqs2. Args: sequences: List of sequences to cluster. min_seq_id: Minimum sequence identity for clustering. polymer_type: Type of sequences. One of "protein", "dna", or "rna". If None, type is inferred from sequences. Returns: List of cluster IDs, in the same order as the input sequences. """ if len(sequences) == 0: return [] with tempfile.TemporaryDirectory() as tmpdir: fasta_path = Path(tmpdir) / "sequences.fasta" output_path = Path(tmpdir) / "cluster" temp_path = Path(tmpdir) / "temp" _write_fasta(sequences=sequences, path=fasta_path) subprocess.run( # noqa: S603 [ # noqa: S607 "mmseqs", "easy-cluster", fasta_path, output_path, temp_path, "--dbtype", _get_dbtype_from_polymer_type(polymer_type), "--min-seq-id", str(min_seq_id), "-v", "1", ], check=False, ) output_df = pl.read_csv( f"{output_path}_cluster.tsv", separator="\t", has_header=False, new_columns=["cluster_id", "sequence_id"] ) clusters = output_df.sort("sequence_id")["cluster_id"] unique_clusters = clusters.unique(maintain_order=True).to_list() return [unique_clusters.index(i) for i in clusters]
def _get_dbtype_from_polymer_type(polymer_type: Literal["protein", "rna", "dna"] | None = None) -> str: match polymer_type: case None: return "0" case "protein": return "1" case "rna" | "dna": return "2" def _write_fasta(*, sequences: Collection[str], path: str | Path) -> None: with Path(path).open("w") as f: f.writelines(f">{i}\n{sequence}\n" for i, sequence in enumerate(sequences))