Source code for boltz_data.sequence._cluster
import subprocess
import tempfile
from collections.abc import Collection
from pathlib import Path
from typing import Literal
import polars as pl
[docs]
def cluster_sequences(
*, sequences: Collection[str], min_seq_id: float = 0.4, polymer_type: Literal["protein", "rna", "dna"] | None = None
) -> list[int]:
"""
Cluster sequences using MMseqs2.
Args:
sequences: List of sequences to cluster.
min_seq_id: Minimum sequence identity for clustering.
polymer_type: Type of sequences. One of "protein", "dna", or "rna". If None, type is inferred from sequences.
Returns:
List of cluster IDs, in the same order as the input sequences.
"""
if len(sequences) == 0:
return []
with tempfile.TemporaryDirectory() as tmpdir:
fasta_path = Path(tmpdir) / "sequences.fasta"
output_path = Path(tmpdir) / "cluster"
temp_path = Path(tmpdir) / "temp"
_write_fasta(sequences=sequences, path=fasta_path)
subprocess.run( # noqa: S603
[ # noqa: S607
"mmseqs",
"easy-cluster",
fasta_path,
output_path,
temp_path,
"--dbtype",
_get_dbtype_from_polymer_type(polymer_type),
"--min-seq-id",
str(min_seq_id),
"-v",
"1",
],
check=False,
)
output_df = pl.read_csv(
f"{output_path}_cluster.tsv", separator="\t", has_header=False, new_columns=["cluster_id", "sequence_id"]
)
clusters = output_df.sort("sequence_id")["cluster_id"]
unique_clusters = clusters.unique(maintain_order=True).to_list()
return [unique_clusters.index(i) for i in clusters]
def _get_dbtype_from_polymer_type(polymer_type: Literal["protein", "rna", "dna"] | None = None) -> str:
match polymer_type:
case None:
return "0"
case "protein":
return "1"
case "rna" | "dna":
return "2"
def _write_fasta(*, sequences: Collection[str], path: str | Path) -> None:
with Path(path).open("w") as f:
f.writelines(f">{i}\n{sequence}\n" for i, sequence in enumerate(sequences))