|
from fuson_plm.utils.logging import open_logfile, log_update |
|
from fuson_plm.benchmarking.idr_prediction.config import CLUSTER |
|
from fuson_plm.utils.clustering import ensure_mmseqs_in_path, process_fasta, analyze_clustering_result, make_fasta, run_mmseqs_clustering, cluster_summary |
|
import os |
|
import pandas as pd |
|
|
|
def main_old():
    """Legacy driver: cluster only the albatross TRAIN split with MMseqs2.

    Reads clustering hyperparameters from the CLUSTER config, writes a FASTA
    of train-split sequences with unique IDs, runs mmseqs clustering, and
    saves the combined results under clustering/. Progress is logged to
    clustering_log.txt.
    """
    log_path = "clustering_log.txt"
    input_path = CLUSTER.INPUT_PATH
    min_seq_id = CLUSTER.MIN_SEQ_ID
    coverage = CLUSTER.C
    cov_mode = CLUSTER.COV_MODE
    cluster_mode = CLUSTER.CLUSTER_MODE
    mmseqs_path = CLUSTER.PATH_TO_MMSEQS

    with open_logfile(log_path):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')

        # All clustering artifacts live under clustering/.
        os.makedirs("clustering", exist_ok=True)
        output_dir = "clustering/raw_output"

        # Restrict to the training split of the albatross dataset.
        df = pd.read_csv(input_path)
        df = df.loc[df['Split'] == 'Train'].reset_index(drop=True)
        log_update(f"\nPreparing input data (albatross TRAIN only)...\n\tdataset size: {len(df)} sequences")

        max_seqlen = max(df['Sequence'].str.len().tolist())
        log_update(f"\tLongest sequence in dataset: {max_seqlen} AAs")

        # Source IDs are not guaranteed unique, so append a positional tag
        # ("s1", "s2", ...) to each comma-joined ID string.
        # NOTE(review): assumes the 'IDs' column holds comma-separated strings.
        tags = [f"s{n}" for n in range(1, len(df) + 1)]
        df['Unique_ID'] = tags
        df['Unique_ID'] = df["IDs"].apply(lambda ids: "_".join(ids.split(','))) + "_" + df['Unique_ID']
        log_update("Not all IDs from the database are unique. Created unique IDs by tagging on sequence #s")
        log_update(f"\tExample: {df.iloc[0]['Unique_ID']}")

        # Build the ID -> sequence mapping and write it out as FASTA.
        id_to_seq = dict(zip(df['Unique_ID'], df['Sequence']))
        fasta_path = make_fasta(id_to_seq, "clustering/input.fasta")
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")

        run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=min_seq_id, c=coverage, cov_mode=cov_mode, cluster_mode=cluster_mode, path_to_mmseqs=mmseqs_path)

        # Merge the raw mmseqs cluster assignments with the input sequences.
        clusters = analyze_clustering_result('clustering/input.fasta', 'clustering/raw_output/mmseqs_cluster.tsv')
        clusters.to_csv('clustering/mmseqs_full_results.csv', index=False)
        log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
        cluster_summary(clusters)
|
|
|
def main():
    """Cluster ALL albatross sequences (train AND test) with MMseqs2.

    Reads clustering hyperparameters from the CLUSTER config, writes a FASTA
    of every input sequence with a unique ID, runs mmseqs clustering, and
    saves the combined results to clustering/mmseqs_full_results.csv.
    Progress is logged to clustering_log.txt.
    """
    LOG_PATH = "clustering_log.txt"
    INPUT_PATH = CLUSTER.INPUT_PATH
    MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
    C = CLUSTER.C
    COV_MODE = CLUSTER.COV_MODE
    CLUSTER_MODE = CLUSTER.CLUSTER_MODE
    PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS

    with open_logfile(LOG_PATH):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')

        # All clustering artifacts live under clustering/.
        os.makedirs("clustering", exist_ok=True)
        output_dir = "clustering/raw_output"

        # Unlike main_old, no split filtering: cluster the full dataset.
        sequences = pd.read_csv(INPUT_PATH)
        log_update(f"\nPreparing input data (albatross train AND test sequences)...\n\tdataset size: {len(sequences)} sequences")

        # Use the vectorized .max() instead of materializing a Python list.
        max_seqlen = sequences['Sequence'].str.len().max()
        log_update(f"\tLongest sequence in dataset: {max_seqlen} AAs")

        # Source IDs are not guaranteed unique, so append a positional tag
        # ("s1", "s2", ...) to each comma-joined ID string.
        # NOTE(review): assumes the 'IDs' column holds comma-separated strings.
        sequences['Unique_ID'] = [f"s{i+1}" for i in range(len(sequences))]
        sequences['Unique_ID'] = sequences["IDs"].apply(lambda x: "_".join(x.split(','))) + "_" + sequences['Unique_ID']
        log_update("Not all IDs from the database are unique. Created unique IDs by tagging on sequence #s")
        log_update(f"\tExample: {sequences.iloc[0]['Unique_ID']}")

        # Keep a distinct name for the ID -> sequence mapping rather than
        # rebinding the DataFrame variable to a dict (original anti-pattern).
        seq_dict = dict(zip(sequences['Unique_ID'], sequences['Sequence']))
        fasta_path = make_fasta(seq_dict, "clustering/input.fasta")
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")

        run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, cluster_mode=CLUSTER_MODE, path_to_mmseqs=PATH_TO_MMSEQS)

        # Merge the raw mmseqs cluster assignments with the input sequences.
        clusters = analyze_clustering_result('clustering/input.fasta', 'clustering/raw_output/mmseqs_cluster.tsv')
        clusters.to_csv('clustering/mmseqs_full_results.csv', index=False)
        log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
        cluster_summary(clusters)
|
|
|
# Script entry point: run clustering on the full (train + test) dataset.
if __name__ == "__main__":

    main()