Source code for boltz_data.sequence._cluster

import subprocess
import tempfile
from collections.abc import Collection
from pathlib import Path
from typing import Literal

import polars as pl



[docs]
def cluster_sequences(
    *, sequences: Collection[str], min_seq_id: float = 0.4, polymer_type: Literal["protein", "rna", "dna"] | None = None
) -> list[int]:
    """
    Cluster sequences using MMseqs2.

    Args:
        sequences: List of sequences to cluster.
        min_seq_id: Minimum sequence identity for clustering.
        polymer_type: Type of sequences. One of "protein", "dna", or "rna". If None, type is inferred from sequences.

    Returns:
        List of cluster IDs, in the same order as the input sequences.

    """
    if len(sequences) == 0:
        return []

    with tempfile.TemporaryDirectory() as tmpdir:
        fasta_path = Path(tmpdir) / "sequences.fasta"
        output_path = Path(tmpdir) / "cluster"
        temp_path = Path(tmpdir) / "temp"
        _write_fasta(sequences=sequences, path=fasta_path)

        subprocess.run(  # noqa: S603
            [  # noqa: S607
                "mmseqs",
                "easy-cluster",
                fasta_path,
                output_path,
                temp_path,
                "--dbtype",
                _get_dbtype_from_polymer_type(polymer_type),
                "--min-seq-id",
                str(min_seq_id),
                "-v",
                "1",
            ],
            check=False,
        )

        output_df = pl.read_csv(
            f"{output_path}_cluster.tsv", separator="\t", has_header=False, new_columns=["cluster_id", "sequence_id"]
        )
        clusters = output_df.sort("sequence_id")["cluster_id"]
        unique_clusters = clusters.unique(maintain_order=True).to_list()
        return [unique_clusters.index(i) for i in clusters]



def _get_dbtype_from_polymer_type(polymer_type: Literal["protein", "rna", "dna"] | None = None) -> str:
    match polymer_type:
        case None:
            return "0"
        case "protein":
            return "1"
        case "rna" | "dna":
            return "2"


def _write_fasta(*, sequences: Collection[str], path: str | Path) -> None:
    with Path(path).open("w") as f:
        f.writelines(f">{i}\n{sequence}\n" for i, sequence in enumerate(sequences))