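"""Cluster ALBATROSS IDR sequences with MMseqs2 for the IDR-prediction benchmark.

Reads the input CSV specified in config.CLUSTER ('Sequence' and 'IDs' columns,
plus 'Split' for main_old), builds a FASTA of the sequences, runs MMseqs2
clustering, and writes the combined results to clustering/mmseqs_full_results.csv.
main_old() clustered only the ALBATROSS Train split; main() clusters the train
and test sequences together.
"""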
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.benchmarking.idr_prediction.config import CLUSTER
from fuson_plm.utils.clustering import ensure_mmseqs_in_path, process_fasta, analyze_clustering_result, make_fasta, run_mmseqs_clustering, cluster_summary
import os
import pandas as pd

def main_old():
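    """Earlier version of main(): cluster only the sequences in the ALBATROSS Train split."""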
    # Read all the input args
    LOG_PATH = "clustering_log.txt"
    INPUT_PATH = CLUSTER.INPUT_PATH
    MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
    C = CLUSTER.C
    COV_MODE = CLUSTER.COV_MODE
    CLUSTER_MODE = CLUSTER.CLUSTER_MODE
    PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
    
    with open_logfile(LOG_PATH):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')
        # Make a subfolder for clustering results and direct MMseqs2 outputs there
        os.makedirs("clustering", exist_ok=True)
        output_dir = "clustering/raw_output"
        
        # Make fasta of input file
        sequences = pd.read_csv(INPUT_PATH)
        # We only want to cluster the ones in the Train split from Albatross
        sequences = sequences.loc[sequences['Split']=='Train'].reset_index(drop=True)
        log_update(f"\nPreparing input data (albatross TRAIN only)...\n\tdataset size: {len(sequences)} sequences")
        
        max_seqlen = sequences['Sequence'].str.len().max()
        log_update(f"\tLongest sequence in dataset: {max_seqlen} AAs")
        
        # These IDs are NOT unique, so tag each one with a sequence index to make it unique
        sequences['Unique_ID'] = [f"s{i+1}" for i in range(len(sequences))]
        sequences['Unique_ID'] = sequences["IDs"].apply(lambda x: "_".join(x.split(','))) + "_" + sequences['Unique_ID']
        log_update("Not all IDs from the database are unique. Created unique IDs by tagging on sequence #s")
        log_update(f"\tExample: {sequences.iloc[0]['Unique_ID']}")
        sequences = dict(zip(sequences['Unique_ID'],sequences['Sequence']))
        fasta_path = make_fasta(sequences, "clustering/input.fasta")
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")
        
        run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, cluster_mode=CLUSTER_MODE, path_to_mmseqs=PATH_TO_MMSEQS)
        
        # Brief read to preview results
        clusters = analyze_clustering_result('clustering/input.fasta', 'clustering/raw_output/mmseqs_cluster.tsv')
        # Save clusters
        clusters.to_csv('clustering/mmseqs_full_results.csv',index=False)
        log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
        cluster_summary(clusters)
        
def main():
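    """Cluster all ALBATROSS sequences (train and test) with MMseqs2 and summarize the clusters."""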
    # Read all the input args
    LOG_PATH = "clustering_log.txt"
    INPUT_PATH = CLUSTER.INPUT_PATH
    MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
    C = CLUSTER.C
    COV_MODE = CLUSTER.COV_MODE
    CLUSTER_MODE = CLUSTER.CLUSTER_MODE
    PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
    
    with open_logfile(LOG_PATH):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')
        # Make a subfolder for clustering results and direct MMseqs2 outputs there
        os.makedirs("clustering", exist_ok=True)
        output_dir = "clustering/raw_output"
        
        # Make fasta of input file
        sequences = pd.read_csv(INPUT_PATH)
        log_update(f"\nPreparing input data (albatross train AND test sequences)...\n\tdataset size: {len(sequences)} sequences")
        
        max_seqlen = sequences['Sequence'].str.len().max()
        log_update(f"\tLongest sequence in dataset: {max_seqlen} AAs")
        
        # These IDs are NOT unique, so tag each one with a sequence index to make it unique
        sequences['Unique_ID'] = [f"s{i+1}" for i in range(len(sequences))]
        sequences['Unique_ID'] = sequences["IDs"].apply(lambda x: "_".join(x.split(','))) + "_" + sequences['Unique_ID']
        log_update("Not all IDs from the database are unique. Created unique IDs by tagging on sequence #s")
        log_update(f"\tExample: {sequences.iloc[0]['Unique_ID']}")
        sequences = dict(zip(sequences['Unique_ID'],sequences['Sequence']))
        fasta_path = make_fasta(sequences, "clustering/input.fasta")
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")
        
        run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, cluster_mode=CLUSTER_MODE, path_to_mmseqs=PATH_TO_MMSEQS)
        
        # Brief read to preview results
        clusters = analyze_clustering_result('clustering/input.fasta', 'clustering/raw_output/mmseqs_cluster.tsv')
        # Save clusters
        clusters.to_csv('clustering/mmseqs_full_results.csv',index=False)
        log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
        cluster_summary(clusters)
    
if __name__ == "__main__":
    main()