Fill-Mask
Transformers
Safetensors
esm
FusOn-pLM / fuson_plm /data /cluster.py
root
uploading data folder
1e6a1f0
raw
history blame
2.1 kB
import pandas as pd
import numpy as np
import os
import subprocess
import sys
from Bio import SeqIO
import shutil
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.data.config import CLUSTER
def main():
    """Cluster the configured input sequences and save the processed results.

    Settings are read from ``CLUSTER`` (imported from
    ``fuson_plm.data.config``). The input CSV must contain ``seq_id`` and
    ``aa_seq`` columns. Rows whose ``aa_seq`` is longer than
    ``CLUSTER.MAX_SEQ_LENGTH`` are dropped, the remaining sequences are handed
    to ``make_fasta`` with the target path ``clustering/input.fasta``, and
    ``run_mmseqs_clustering`` is called with ``clustering/raw_output`` as its
    output directory. The table returned by ``analyze_clustering_result`` is
    written to ``clustering/mmseqs_full_results.csv``.

    All paths are relative to the current working directory. Logging goes
    through ``open_logfile("clustering_log.txt")`` / ``log_update``.

    NOTE(review): ``make_fasta``, ``run_mmseqs_clustering``,
    ``analyze_clustering_result`` and ``cluster_summary`` are not imported in
    the visible part of this file -- confirm they are in scope at runtime.
    """
    # Read all the input args
    log_path = "clustering_log.txt"
    input_path = CLUSTER.INPUT_PATH
    min_seq_id = CLUSTER.MIN_SEQ_ID
    c = CLUSTER.C
    cov_mode = CLUSTER.COV_MODE
    path_to_mmseqs = CLUSTER.PATH_TO_MMSEQS
    max_seq_length = CLUSTER.MAX_SEQ_LENGTH

    # Every artifact lives under one folder; define each path exactly once so
    # the writer, the reader and the log messages cannot drift apart.
    clustering_dir = "clustering"
    output_dir = f"{clustering_dir}/raw_output"
    input_fasta = f"{clustering_dir}/input.fasta"
    # Presumably produced by run_mmseqs_clustering inside output_dir -- verify
    # against that helper.
    cluster_tsv = f"{output_dir}/mmseqs_cluster.tsv"
    results_csv = f"{clustering_dir}/mmseqs_full_results.csv"

    with open_logfile(log_path):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')

        # Make a subfolder for clustering results, and direct MMSeqs2 outputs
        # here. exist_ok avoids the check-then-create race of
        # os.path.exists() followed by os.mkdir().
        os.makedirs(clustering_dir, exist_ok=True)

        # Make fasta of input file
        sequences = pd.read_csv(input_path)
        log_update(f"\nPreparing input data...\n\tInitial dataset size: {len(sequences)} sequences")

        # Rows with a missing aa_seq compare as False here and are dropped too.
        within_cutoff = sequences['aa_seq'].str.len() <= max_seq_length
        sequences = sequences.loc[within_cutoff].reset_index(drop=True)
        log_update(f"\tApplied length cutoff of {max_seq_length}AAs. New dataset size: {len(sequences)} sequences")

        # Map seq_id -> aa_seq. If a seq_id repeats, only its last sequence is
        # kept, because later dict entries overwrite earlier ones.
        seq_by_id = dict(zip(sequences['seq_id'], sequences['aa_seq']))
        fasta_path = make_fasta(seq_by_id, input_fasta)
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")

        run_mmseqs_clustering(
            fasta_path,
            output_dir,
            min_seq_id=min_seq_id,
            c=c,
            cov_mode=cov_mode,
            path_to_mmseqs=path_to_mmseqs,
        )

        # Brief read to preview results
        clusters = analyze_clustering_result(input_fasta, cluster_tsv)

        # Save clusters
        clusters.to_csv(results_csv, index=False)
        log_update(f"Processed and combined mmseqs output. Wrote comprehensive results to {results_csv}")
        cluster_summary(clusters)
# Run the clustering pipeline only when this file is executed as a script;
# importing the module must not trigger it.
if __name__ == "__main__":
    main()