### Prepare to BLAST all of our sequences against UniProt import pandas as pd import os import subprocess import time import re import pickle import numpy as np from fuson_plm.utils.logging import log_update, open_logfile from fuson_plm.utils.embedding import redump_pickle_dictionary from fuson_plm.data.blast.plot import group_difference_plot, group_swiss_and_ht_plot, group_box_plot, group_pos_id_plot def prepare_blast_inputs(): log_update("\nPreparing BLAST Inputs. Logging every 1000 sequences... ") # make directory for input and output os.makedirs("blast_inputs", exist_ok=True) # read the fuson database fuson_db = pd.read_csv('../fuson_db.csv') # make dictionary mapping sequences to seqids (for naming input filess) fuson_db_dict = dict(zip(fuson_db['aa_seq'],fuson_db['seq_id'])) # convert the database into fasta format new_fa_files_created = 0 old_fa_files_found = 0 total_seqs_processed=0 for i, (seq, seqid) in enumerate(fuson_db_dict.items()): total_seqs_processed+=1 # if the path already exists, skip if os.path.exists(f"blast_inputs/{seqid}.fa"): old_fa_files_found+=1 else: new_fa_files_created+=1 with open(f"blast_inputs/{seqid}.txt", 'w') as f: fasta_lines = '>' + seqid + '\n' + seq f.write(fasta_lines) # rename it to .fa os.rename(f"blast_inputs/{seqid}.txt", f"blast_inputs/{seqid}.fa") if i%1000==0: log_update(f"\t\t{i}\t{seqid}:{seq}") log_update("\tFinished preparing BLAST Inputs (results in blast_inputs folder)") log_update(f"\t\tSequences processed: {total_seqs_processed}/{len(fuson_db)} seqs in FusOn-DB\n\t\tFasta files found: {old_fa_files_found}\n\t\tNew fasta files created: {new_fa_files_created}") def run_blast(blast_inputs_dir, database="swissprot",n=1,interval=2000): """ Run BLAST on all files in blast_inputs_dir """ # Must change the PATH variable to include the BLAST executables os.environ['PATH'] += ":./ncbi-blast-2.16.0+/bin" os.environ['BLASTDB'] = f"ncbi-blast-2.16.0+/{database}" # make directory for outputs os.makedirs("blast_outputs", exist_ok=True) os.makedirs(f"blast_outputs/{database}", exist_ok=True) already_blasted = os.listdir(f"blast_outputs/{database}") blast_input_files = os.listdir(blast_inputs_dir) # Sort the list using a custom key to extract the numeric part blast_input_files = sorted(blast_input_files, key=lambda x: int(re.search(r'\d+', x).group())) # print how many we've already blasted log_update(f"Running BLAST.\n\t{len(blast_input_files)} input files\n\t{len(already_blasted)} already blasted\n") tot_seqs_processed = 0 total_blast_time = 0 start_i = interval*(n-1) end_i = interval*n if end_i>len(blast_input_files): end_i = len(blast_input_files) for i, blast_input_file in enumerate(blast_input_files[start_i:end_i]): tot_seqs_processed+=1 # blast_input_file is of the format seqid.fa seqid = blast_input_file.split('.fa')[0] input_path = f"blast_inputs/{blast_input_file}" output_path = f"blast_outputs/{database}/{seqid}_{database}_results.out" if os.path.exists(output_path): log_update(f"\t{i+1}.\tAlready blasted {seqid}") continue # Construct the command as a list of arguments command = [ "ncbi-blast-2.16.0+/bin/blastp", "-db", database, "-query", input_path, "-out", output_path ] # Run the command, and time it blast_start_time = time.time() result = subprocess.run(command, capture_output=True, text=True) blast_end_time = time.time() blast_seq_time = blast_end_time-blast_start_time total_blast_time+=blast_seq_time # Check if there was an error if result.returncode != 0: log_update(f"\t{i+1}.\tError running BLAST for {seqid}: {result.stderr} ({blast_seq_time:.2f}s)") else: log_update(f"\t{i+1}.\tBLAST search completed for {seqid} ({blast_seq_time:.2f}s)") log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_blast_time:.2f}s)") def remove_incomplete_blasts(database="swissprot"): incomplete_list = [] for fname in os.listdir(f"blast_outputs/{database}"): complete=False with open(f"blast_outputs/{database}/{fname}", "r") as f: lines = f.readlines() if len(lines)>1 and "Window for multiple hits:" in lines[-1]: complete=True if not complete: incomplete_list.append(fname) log_update(f"\t{len(incomplete_list)} BLAST files are incomplete (due to BLAST errors). Deleting them. Rerun these") # remove all these files for fname in incomplete_list: os.remove(f"blast_outputs/{database}/{fname}") def find_nomatch_blasts(fuson_ht_db, database="swissprot"): no_match_list = [] for fname in os.listdir(f"blast_outputs/{database}"): match=True with open(f"blast_outputs/{database}/{fname}", "r") as f: lines = f.readlines() if len(lines)>1 and "No hits found" in lines[28]: # it'll say no hits found if there are no hits match=False if not match: no_match_list.append(fname) log_update(f"\t{len(no_match_list)} sequence IDs had no match in the BLAST database {database}") # write no match list to a file in blast_outputs with open(f"blast_outputs/{database}_no_match.txt","w") as f: for i, fname in enumerate(no_match_list): if i!=len(no_match_list)-1: f.write(f"{fname}\n") else: f.write(f"{fname}") # write a subset of fuson_ht_db containing these sequences as well no_match_ids = [x.split('_')[0] for x in no_match_list] subset = fuson_ht_db.loc[ fuson_ht_db['seq_id'].isin(no_match_ids) ].reset_index(drop=True) subset.to_csv(f"blast_outputs/{database}_no_match.csv",index=False) return no_match_ids def make_fuson_ht_db(path_to_fuson_db="../fuson_db.csv", path_to_unimap="../head_tail_data/htgenes_uniprotids.csv",savepath="fuson_ht_db.csv"): """ Make a version of the fuson_db that has all the heads and tails for each of the genes. Will make it easier to analyze blast results """ if os.path.exists(savepath): df = pd.read_csv(savepath) return df # read both of teh databases fuson_db = pd.read_csv(path_to_fuson_db) ht_db = pd.read_csv(path_to_unimap) # Make it such that each row of fuson_db just has ONE head and ONE tail fuson_ht_db = fuson_db.copy(deep=True) fuson_ht_db['fusiongenes'] = fuson_ht_db['fusiongenes'].apply(lambda x: x.split(',')) fuson_ht_db = fuson_ht_db.explode('fusiongenes') fuson_ht_db['hgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[0] fuson_ht_db['tgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[1] # Merge on head, then merge on tail fuson_ht_db = pd.merge( # merge on head fuson_ht_db, ht_db.rename(columns={ 'Gene': 'hgene', 'UniProtID': 'hgUniProt', 'Reviewed': 'hgUniProtReviewed' }), on='hgene', how='left' ) fuson_ht_db = pd.merge( # merge on tail fuson_ht_db, ht_db.rename(columns={ 'Gene': 'tgene', 'UniProtID': 'tgUniProt', 'Reviewed': 'tgUniProtReviewed' }), on='tgene', how='left' ) # Make sure we haven't lost anything tot_og_seqids = len(fuson_db['seq_id'].unique()) tot_final_seqids = len(fuson_ht_db['seq_id'].unique()) log_update(f"\tTotal sequence IDs in combined database = {tot_final_seqids}. Matches expected: {tot_final_seqids==tot_og_seqids}") # Each fusion should have the same number of ROWS as it does commas+1 fuson_db['n_commas'] = fuson_db['fusiongenes'].str.count(',') + 1 seqid_rows_map = dict(zip(fuson_db['seq_id'],fuson_db['n_commas'])) vc = fuson_ht_db['seq_id'].value_counts().reset_index() vc['expected_count'] = vc['index'].map(seqid_rows_map) log_update(f"\tEach seq_id has the expected number of head-tail combos: {(vc['expected_count']==vc['seq_id']).all()}") log_update(f"\tPreview of combined database:") prev = fuson_ht_db.head(10) prev['aa_seq'] = prev['aa_seq'].apply(lambda x: x[0:10]+'...') log_update(prev.to_string(index=False)) fuson_ht_db.to_csv(savepath, index=False) return fuson_ht_db def format_dict(d, indent=0): """ Recursively formats a dictionary for display purposes. Args: d (dict): The dictionary to format. indent (int): The current level of indentation. Returns: str: A formatted string representing the dictionary. """ formatted_str = "" # Iterate through each key-value pair in the dictionary for key, value in d.items(): # Create the current indentation current_indent = " " * (indent * 4) # Add the key formatted_str += f"{current_indent}{repr(key)}: " # Check the type of the value if isinstance(value, dict): # If dictionary, call format_dict recursively formatted_str += "{\n" + format_dict(value, indent + 1) + current_indent + "},\n" elif isinstance(value, list): # If list, convert it to a formatted string formatted_str += f"[{', '.join(repr(item) for item in value)}],\n" elif isinstance(value, str): # If string, enclose in quotes formatted_str += f"'{value}',\n" elif value is None: # If None, display as 'None' formatted_str += "None,\n" else: formatted_str += f"{repr(value)},\n" return formatted_str def parse_blast_output(file_path, head_ids, tail_ids): """ Args: - file_path: /path/to/blast/output - head_ids: list of all UniProt IDs for the head protien - tail_ids: list of all UniProt IDs for the tail protein """ target_ids = list(set(head_ids + tail_ids)) # make a list to make some functions easier with open(file_path, 'r') as file: best_data = {tid: None for tid in target_ids} # stores the best alignment for each ID we care about current_data = {tid: {} for tid in target_ids} # stores the current data for each ID we care about (most recent alignment we read) best_score = {tid: -float('inf') for tid in target_ids} # stores the best score for each ID we care about capture = {tid: False for tid in target_ids} # whether we are currently processing this ID replace_best = {tid: False for tid in target_ids} # whether we should replace the best_data with the current_data for this ID isoform_dict = {tid: None for tid in target_ids} # dictionary of isoforms for # variables that will only be used for getting the best alignment alignment_count = 0 cur_id = None on_best_alignment=False # Iterate through lines for line in file: line = line.strip() # if NEW ID (not necessarily new alignment! can be multiple alignmetns under one >) if line.startswith('>'): found_tid_in_header=False # assume we have not found a target ID we are looking for alignment_count+=1 if alignment_count==1: # we're on the best alignment because this is the one that's listed first! it should be on_best_alignment=True else: on_best_alignment = False ## We may have just finisehd processing an ID. Check for the one who currently has capture set to true just_captured = None total_captured = 0 for k, v in capture.items(): if v: total_captured+=1 just_captured = k # we should never be capturing more than one thing at a time. make sure of this assert total_captured<2 if just_captured is not None: if replace_best[just_captured]: # if we just finished an alignment for the just_captured ID, and it's the best one, put it in best_data[just_captured] = current_data[just_captured].copy() replace_best[just_captured] = False # we just did the replacement, so reset it # Check if the line contains any of the target IDs. # This means EITHER [UniProtID] or [UniProtID.Isoform] or [UniProtID-Isoform] is in the line for tid in target_ids: pattern = fr">{tid}([.-]\d+)? " # for ID P02671, would match ">P02671 ", ">P02671.2 " and ">P02671-2 " if re.search(pattern, line): # if this ID matches isoform_dict[tid] = None # set it to None, update it if we need to if "." in line: # look for isoform denoted by . if there is one, otherwise it'll stay as None isoform = int(line.split(".")[1].split(" ")[0]) isoform_dict[tid] = isoform #print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}") elif "-" in line: # look for isoform denoted by - if there is one, otherwise it'll stay as None isoform = int(line.split("-")[1].split(" ")[0]) isoform_dict[tid] = isoform #print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}") capture[tid] = True current_data[tid] = {'header': line} found_tid_in_header=True # we've found the tid that's in this line, so no need to check theothers else: capture[tid] = False if on_best_alignment: # if this is the best alignment if not(found_tid_in_header): # if none of our TIDs are it cur_id_full = line.split('>')[1].split(' ')[0] cur_id, isoform = cur_id_full, None isoform_dict[cur_id] = None # change this if we need if "." in cur_id_full: # if there's a dot, it's an isoform. cur_id = cur_id_full.split(".")[0] isoform = int(cur_id_full.split(".")[1]) isoform_dict[cur_id] = isoform #log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}") #log_update(f"\t\t\tFull line: {line}") # so we can see the gene name. does it make sense? elif "-" in cur_id_full: # if there's a -, it's an isoform. cur_id = cur_id_full.split("-")[0] isoform = int(cur_id_full.split("-")[1]) isoform_dict[cur_id] = isoform #log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}") #log_update(f"\t\t\tFull line: {line}") # so we can see the gene name. does it make sense? # add this id to all the dictionaries best_data[cur_id] = None current_data[cur_id] = {} best_score[cur_id] = -float('inf') capture[cur_id] = False replace_best[cur_id] = False for tid in target_ids: if capture[tid]: # if we're currently on an alignment for a tid we care about if 'Score =' in line: if replace_best[tid]: # if we're replacing the best alignment with this one, within the same ID, do it best_data[tid] = current_data[tid].copy() # now reset the variable! replace_best[tid] = False score_value = float(line.split()[2]) # Assuming "Score = 1053 bits (2723)" format current_data[tid] = {} # Reset current_data for this ID current_data[tid]['Isoform'] = isoform_dict[tid] current_data[tid]['Score'] = score_value current_data[tid]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip() current_data[tid]['Query_Aligned'] = [] current_data[tid]['Subject_Aligned'] = [] # Set the ID as a head or tail, or neither (neither shouldn't happen here though) if tid in head_ids: current_data[tid]['H_or_T'] = 'Head' if tid in tail_ids: current_data[tid]['H_or_T'] = 'Head,Tail' elif tid in tail_ids: current_data[tid]['H_or_T'] = 'Tail' else: current_data[tid]['H_or_T'] = np.nan current_data[tid]['Best'] = True if on_best_alignment else False if score_value > best_score[tid]: # if this is the best score we have for an alignment of this protein best_score[tid] = score_value replace_best[tid] = True else: replace_best[tid] = False if 'Identities =' in line: idents = line.split(', ') current_data[tid]['Identities'] = idents[0].split('=')[1].strip() current_data[tid]['Positives'] = idents[1].split('=')[1].strip() current_data[tid]['Gaps'] = idents[2].split('=')[1].strip() if line.startswith('Query'): parts = line.split() if 'Query_Start' not in current_data[tid]: current_data[tid]['Query_Start'] = int(parts[1]) current_data[tid]['Query_End'] = int(parts[3]) current_data[tid]['Query_Aligned'].append(parts[2]) if line.startswith('Sbjct'): parts = line.split() if 'Sbjct_Start' not in current_data[tid]: current_data[tid]['Sbjct_Start'] = int(parts[1]) current_data[tid]['Sbjct_End'] = int(parts[3]) current_data[tid]['Subject_Aligned'].append(parts[2]) # if we're on the best alignment and it's not one of our target_ids, still process it the same way if on_best_alignment: if not(found_tid_in_header): if 'Score =' in line: if replace_best[cur_id]: # if we're replacing the best alignment with this one, within the same ID, do it best_data[cur_id] = current_data[cur_id].copy() # now reset the variable! replace_best[cur_id] = False score_value = float(line.split()[2]) # Assuming "Score = 1053 bits (2723)" format current_data[cur_id] = {} # Reset current_data for this ID current_data[cur_id]['Isoform'] = isoform_dict[cur_id] current_data[cur_id]['Score'] = score_value current_data[cur_id]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip() current_data[cur_id]['Query_Aligned'] = [] current_data[cur_id]['Subject_Aligned'] = [] # Set the ID as a head or tail, or neither if cur_id in head_ids: current_data[cur_id]['H_or_T'] = 'Head' if cur_id in tail_ids: current_data[cur_id]['H_or_T'] = 'Head,Tail' elif cur_id in tail_ids: current_data[cur_id]['H_or_T'] = 'Tail' else: current_data[cur_id]['H_or_T'] = np.nan current_data[cur_id]['Best'] = True if score_value > best_score[cur_id]: # if this is the best score we have for an alignment of this protein best_score[cur_id] = score_value replace_best[cur_id] = True else: replace_best[cur_id] = False if 'Identities =' in line: idents = line.split(', ') current_data[cur_id]['Identities'] = idents[0].split('=')[1].strip() current_data[cur_id]['Positives'] = idents[1].split('=')[1].strip() current_data[cur_id]['Gaps'] = idents[2].split('=')[1].strip() if line.startswith('Query'): parts = line.split() if 'Query_Start' not in current_data[cur_id]: current_data[cur_id]['Query_Start'] = int(parts[1]) current_data[cur_id]['Query_End'] = int(parts[3]) current_data[cur_id]['Query_Aligned'].append(parts[2]) if line.startswith('Sbjct'): parts = line.split() if 'Sbjct_Start' not in current_data[cur_id]: current_data[cur_id]['Sbjct_Start'] = int(parts[1]) current_data[cur_id]['Sbjct_End'] = int(parts[3]) current_data[cur_id]['Subject_Aligned'].append(parts[2]) # add cur_id to target_ids if it's not none if not(cur_id is None): target_ids += [cur_id] # Check at the end of the file if the last scores are the best for tid in target_ids: if replace_best[tid]: best_data[tid] = current_data[tid].copy() # Combine sequences into single strings for the best data for each ID for tid in target_ids: #print(tid) if best_data[tid]: #print(f"there is a best alignment for {tid}") #print(f"best: {best_data[tid]}") #print(f"current: {current_data[tid]}") best_data[tid]['Query_Aligned'] = ''.join(best_data[tid]['Query_Aligned']) best_data[tid]['Subject_Aligned'] = ''.join(best_data[tid]['Subject_Aligned']) return best_data def parse_all_blast_results(fuson_ht_db, database="swissprot"): """ Analyze the BLAST outputs for each fusion protein against UniProt. Use the fuson_ht_db to look for the heads and tails that we expect. If they can't be found, ... ? """ output_file=f"blast_outputs/{database}_blast_output_analyzed.pkl" all_seq_ids = fuson_ht_db['seq_id'].unique().tolist() all_seq_ids = sorted(all_seq_ids, key=lambda x: int(re.search(r'\d+', x).group())) # sort by the number. seq1, seq2, ... prior_results = {} if os.path.exists(output_file): with open(output_file, "rb") as f: prior_results = pickle.load(f) # Iterate through seq_ids total_parse_time = 0 tot_seqs_processed = 0 for seq_id in all_seq_ids: try: tot_seqs_processed+=1 # If we've already processed it, skip if seq_id in prior_results: log_update(f"\tAlready processed {seq_id} blast results. Continuing") continue file_path = f"blast_outputs/{database}/{seq_id}_{database}_results.out" aa_seq = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['aa_seq'].tolist()[0] # Remember, fuson_ht_db has all the IDs for ALL the different head and tail gene identifiers. fusion_genes = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['fusiongenes'].tolist() ##### Process heads head_ids = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['hgUniProt'].dropna().tolist() head_reviewed, head_reviewed_dict = "", {} if len(head_ids)>0: # if we found head IDs, we can process them and figure out if they're reviewed head_ids = ",".join(head_ids).split(",") head_reviewed = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['hgUniProtReviewed'].dropna().tolist() head_reviewed = list("".join(head_reviewed)) head_reviewed_dict = dict(zip(head_ids, head_reviewed)) head_ids = list(head_reviewed_dict.keys()) # there may be some duplicates, so separate them out again head_reviewed = list(head_reviewed_dict.values()) head_genes = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['hgene'].unique().tolist() ##### Process tails - same logic tail_ids = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['tgUniProt'].dropna().tolist() tail_reviewed, tail_reviewed_dict = "", {} if len(tail_ids)>0: # if we found tail IDs, we can process them and figure out if they're reviewed tail_ids = ",".join(tail_ids).split(",") tail_reviewed = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['tgUniProtReviewed'].dropna().tolist() tail_reviewed = list("".join(tail_reviewed)) tail_reviewed_dict = dict(zip(tail_ids, tail_reviewed)) tail_ids = list(tail_reviewed_dict.keys()) # there may be some duplicates, so separate them out again tail_reviewed = list(tail_reviewed_dict.values()) tail_genes = fuson_ht_db.loc[ fuson_ht_db['seq_id']==seq_id ]['tgene'].unique().tolist() ###### Log what we just found log_update(f"\tEvaluating {seq_id}, fusion genes = {fusion_genes}, len = {len(aa_seq)}...\n\t\tfile_path={file_path}") #log_update(f"\n\t\thead genes={head_genes}\n\t\thead_ids={head_ids}\n\t\ttail genes={tail_genes}\n\t\ttail_ids={tail_ids}") ### Do the analysis and time it parse_start_time = time.time() # time it blast_data = parse_blast_output(file_path, head_ids, tail_ids) parse_end_time = time.time() parse_seq_time = parse_end_time-parse_start_time total_parse_time+=parse_seq_time log_update(f"\t\tBLAST output analysis completed for {seq_id} ({parse_seq_time:.2f}s)") # Give preview of results. Logging the whole dict would be too much, so let's just see what we found #log_update(format_dict(blast_data,indent=3)) n_og_reviewed_head_ids = len([x for x in head_reviewed if x=='1']) found_head_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Head','Head,Tail'])] n_found_reviewed_head_ids = len([x for x in found_head_ids if head_reviewed_dict[x]=='1']) n_og_reviewed_tail_ids = len([x for x in tail_reviewed if x=='1']) found_tail_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Tail','Head,Tail'])] n_found_reviewed_tail_ids = len([x for x in found_tail_ids if tail_reviewed_dict[x]=='1']) #log_update(f"\t\t{len(found_head_ids)}/{len(head_ids)} head protein UniProt IDs ({n_found_reviewed_head_ids}/{n_og_reviewed_head_ids} REVIEWED heads) had alignments") #log_update(f"\t\t{len(found_tail_ids)}/{len(tail_ids)} tail protein UniProt IDs ({n_found_reviewed_tail_ids}/{n_og_reviewed_tail_ids} REVIEWED tails) had alignments") # write results to pickle file to_pickle_dict = {seq_id: blast_data} with open(output_file, 'ab+') as f: pickle.dump(to_pickle_dict, f) except: log_update(f"{seq_id} failed") # redump the pickle even if we hit an error, so that we can fix the error and continue processing results redump_pickle_dictionary(output_file) # Log total time log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_parse_time:.2f}s)") # redump the pickle redump_pickle_dictionary(output_file) def analyze_blast_results(fuson_ht_db, database="swissprot"): blast_results_path=f"blast_outputs/{database}_blast_output_analyzed.pkl" stats_df_savepath = f"blast_outputs/{database}_blast_stats.csv" top_alignments_df_savepath = f"blast_outputs/{database}_top_alignments.csv" stats_df, top_alignments_df = None, None if os.path.exists(stats_df_savepath) and os.path.exists(top_alignments_df_savepath): stats_df = pd.read_csv(stats_df_savepath) top_alignments_df = pd.read_csv(top_alignments_df_savepath, dtype={'top_hg_UniProt_isoform':'str', 'top_tg_UniProt_isoform': 'str', 'top_UniProt_isoform': 'str'}) else: with open(blast_results_path, "rb") as f: results = pickle.load(f) # analyze the results # first, basic stats. How many of them have at least one head or tail alignment?? seqid_stats = {} top_alignments_dict = {} for seq_id in list(results.keys()): seqid_stats[seq_id] = { 'hgAlignments': 0, 'tgAlignments': 0, 'totalAlignments': 0, 'best_hgScore': 0, 'best_tgScore': 0, 'best_Score': 0 } top_alignments_dict[seq_id] = { 'top_hg_UniProtID': None, 'top_hg_UniProt_isoform': None, 'top_hg_UniProt_fus_indices': None, 'top_tg_UniProtID': None, 'top_tg_UniProt_isoform': None, 'top_tg_UniProt_fus_indices': None, 'top_UniProtID': None, 'top_UniProt_isoform': None, 'top_UniProt_fus_indices': None } for uniprot, d in results[seq_id].items(): if not(d is None): isoform = d['Isoform'] # set up the indices string query_start = d['Query_Start'] if (query_start is None) or (type(query_start)==float and np.isnan(query_start)): query_start = '' else: query_start = int(query_start) query_end = d['Query_End'] if (query_end is None) or (type(query_end)==float and np.isnan(query_end)): query_end = '' else: query_end = int(query_end) fus_indices = f"{query_start},{query_end}".strip(",") if d['H_or_T'] in ['Head', 'Head,Tail']: seqid_stats[seq_id]['hgAlignments'] +=1 if d['Score'] > seqid_stats[seq_id]['best_hgScore']: seqid_stats[seq_id]['best_hgScore'] = d['Score'] if type(uniprot)==float or uniprot is None: top_alignments_dict[seq_id]['top_hg_UniProtID'] = '' else: top_alignments_dict[seq_id]['top_hg_UniProtID'] = uniprot if (type(isoform)==float and np.isnan(isoform)) or isoform is None: top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = '' else: top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = str(int(isoform)) top_alignments_dict[seq_id]['top_hg_UniProt_fus_indices'] = fus_indices if d['H_or_T'] in ['Tail','Head,Tail']: seqid_stats[seq_id]['tgAlignments'] +=1 if d['Score'] > seqid_stats[seq_id]['best_tgScore']: seqid_stats[seq_id]['best_tgScore'] = d['Score'] if type(uniprot)==float or uniprot is None: top_alignments_dict[seq_id]['top_tg_UniProtID'] = '' else: top_alignments_dict[seq_id]['top_tg_UniProtID'] = uniprot if (type(isoform)==float and np.isnan(isoform)) or isoform is None: top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = '' else: top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = str(int(isoform)) top_alignments_dict[seq_id]['top_tg_UniProt_fus_indices'] = fus_indices # increment total no matter what type of alignment it is seqid_stats[seq_id]['totalAlignments']+=1 #if d['Score'] > seqid_stats[seq_id]['best_Score']: if d['Best']==True: # should be indicated if this is the best!! seqid_stats[seq_id]['best_Score'] = d['Score'] if type(uniprot)==float or uniprot is None: top_alignments_dict[seq_id]['top_UniProtID'] = '' else: top_alignments_dict[seq_id]['top_UniProtID'] = uniprot if (type(isoform)==float and np.isnan(isoform)) or isoform is None: top_alignments_dict[seq_id]['top_UniProt_isoform'] = '' else: top_alignments_dict[seq_id]['top_UniProt_isoform'] = str(int(isoform)) top_alignments_dict[seq_id]['top_UniProt_fus_indices'] = fus_indices # now get positives and identities if 'Identities' not in d: print(seq_id, uniprot, d.keys()) identities = d['Identities'] identities = int(identities.split('/')[0]) positives = d['Positives'] positives = int(positives.split('/')[0]) top_alignments_dict[seq_id]['top_UniProt_nIdentities'] = identities top_alignments_dict[seq_id]['top_UniProt_nPositives'] = positives stats_df = pd.DataFrame.from_dict(seqid_stats, orient='index').reset_index().rename(columns={'index':'seq_id'}) stats_df['h_or_t_alignment'] = stats_df.apply(lambda row: True if (row['hgAlignments']>0 or row['tgAlignments']>0) else False, axis=1) stats_df['h_and_t_alignment'] = stats_df.apply(lambda row: True if (row['hgAlignments']>0 and row['tgAlignments']>0) else False, axis=1) stats_df.to_csv(stats_df_savepath,index=False) top_alignments_df = pd.DataFrame.from_dict(top_alignments_dict, orient='index').reset_index().rename(columns={'index':'seq_id'}) # add in the sequence length so we can get percentages fusion_id_seq_dict = dict(zip(fuson_ht_db['seq_id'],fuson_ht_db['aa_seq'])) assert len(fusion_id_seq_dict) == len(fuson_ht_db['seq_id'].unique()) == len(fuson_ht_db['aa_seq'].unique()) top_alignments_df['aa_seq_len'] = top_alignments_df['seq_id'].map(fusion_id_seq_dict).str.len() top_alignments_df.to_csv(top_alignments_df_savepath,index=False) # also, find which ones have no match at all # does it match? no_match_list1 = find_nomatch_blasts(fuson_ht_db, database=database) log_update(stats_df.head(10).to_string()) # how many have at least one head or tail? log_update(f"Total sequences: {len(stats_df)}") log_update(f"Sequences with >=1 head alignment: {len(stats_df.loc[stats_df['hgAlignments']>0])}") log_update(f"Sequences with >=1 tail alignment: {len(stats_df.loc[stats_df['tgAlignments']>0])}") log_update(f"Sequences with >=1 head OR tail alignment: {len(stats_df.loc[stats_df['h_or_t_alignment']])}") log_update(f"Sequences with >=1 head AND tail alignment: {len(stats_df.loc[stats_df['h_and_t_alignment']])}") log_update(f"Sequences with ANY alignment: {len(stats_df.loc[stats_df['totalAlignments']>0])}") top_alignments_df = top_alignments_df.replace({None: ''}) log_update(f"Preview of top alignments for {database} search:\n{top_alignments_df.head(10).to_string(index=False)}") top_alignments_df['hiso'] = top_alignments_df['top_hg_UniProtID']+'-'+top_alignments_df['top_hg_UniProt_isoform'] top_alignments_df['tiso'] = top_alignments_df['top_tg_UniProtID']+'-'+top_alignments_df['top_tg_UniProt_isoform'] top_alignments_df['biso'] = top_alignments_df['top_UniProtID']+'-'+top_alignments_df['top_UniProt_isoform'] top_hgs = set([x.strip('-') for x in top_alignments_df['hiso'].tolist()]) # if things don't have isoforms they'll just end in - top_tgs = set([x.strip('-') for x in top_alignments_df['tiso'].tolist()]) top_bgs = set([x.strip('-') for x in top_alignments_df['biso'].tolist()]) top_gs = top_hgs | top_tgs | top_bgs log_update(f"\nTotal unique head proteins (including isoform) producing top head alignments: {len(top_hgs)}") log_update(f"\nTotal unique tail proteins (including isoform) producing top tail alignments: {len(top_tgs)}") log_update(f"\nTotal unique proteins (including isoform) - head, tail, or neither - producing top alignments: {len(top_gs)}") return stats_df, top_alignments_df def compare_database_blasts(fuson_ht_db, swissprot_blast_stats, fusion_hts_blast_stats, make_new_plots=True): # let's start by just returning a list of IDs that were # cols = seq_id hgAlignments tgAlignments totalAlignments best_hgScore best_tgScore best_Score h_or_t_alignment h_and_t_alignment # distinguish the columns og_cols = list(swissprot_blast_stats.columns)[1::] for c in og_cols: if c!='seq_id': swissprot_blast_stats = swissprot_blast_stats.rename(columns={c: f"swiss_{c}"}) for c in og_cols: if c!='seq_id': fusion_hts_blast_stats = fusion_hts_blast_stats.rename(columns={c: f"hts_{c}"}) # merge merged = pd.merge(swissprot_blast_stats, fusion_hts_blast_stats, on='seq_id', how='outer') diff_cols = og_cols[0:-2] differences = pd.DataFrame(columns=diff_cols) log_update(f"Making volcano plots of the differences between fusion head-tail BLAST and swissprot BLAST in the following columns:\n\t{','.join(diff_cols)}") for c in diff_cols: differences[c] = merged[f"hts_{c}"] - merged[f"swiss_{c}"] # make some box plots of differences # Generate volcano plots for each column if make_new_plots: os.makedirs("figures",exist_ok=True) os.makedirs("figures/database_comparison",exist_ok=True) os.makedirs("figures/database_comparison/differences",exist_ok=True) os.makedirs("figures/database_comparison/values",exist_ok=True) os.makedirs("figures/database_comparison/box",exist_ok=True) group_difference_plot(differences) group_swiss_and_ht_plot(merged.drop(columns=['seq_id']), diff_cols) group_box_plot(merged.drop(columns=['seq_id']), diff_cols) def fasta_to_dataframe(fasta_file): # Read the file into a DataFrame with a single column df = pd.read_fwf(fasta_file, header=None, colspecs=[(0, None)], names=['content']) # Select even and odd lines using pandas slicing ids = df.iloc[::2].reset_index(drop=True) # Even-indexed lines (IDs) sequences = df.iloc[1::2].reset_index(drop=True) # Odd-indexed lines (sequences) # Combine into a new DataFrame fasta_df = pd.DataFrame({'ID': ids['content'], 'Sequence': sequences['content']}) fasta_df['ID'] = fasta_df['ID'].str.split('>',expand=True)[1] fasta_df['Sequence'] = fasta_df['Sequence'].str.strip().str.strip('\n') # print a preview of this temp = fasta_df.head(10) temp['Sequence'] = temp['Sequence'].apply(lambda x: x[0:10]+'...') log_update(f"Preview of head/tail fasta sequences in a dataframe:\n{temp.to_string(index=False)}") return fasta_df def get_ht_uniprot_query(swissprot_top_alignments_df): ''' Use swissprot_top_alignments_df to curate all the unique UniProt IDs (ID.Isoform) that created top head and tail alignments ''' swissprot_top_alignments_df['top_hg_full'] = swissprot_top_alignments_df['top_hg_UniProtID']+'.'+swissprot_top_alignments_df['top_hg_UniProt_isoform'] swissprot_top_alignments_df['top_tg_full'] = swissprot_top_alignments_df['top_tg_UniProtID']+'.'+swissprot_top_alignments_df['top_tg_UniProt_isoform'] unique_heads = swissprot_top_alignments_df.loc[ swissprot_top_alignments_df['top_hg_UniProtID'].notna() ]['top_hg_full'].unique().tolist() unique_tails = swissprot_top_alignments_df.loc[ swissprot_top_alignments_df['top_tg_UniProtID'].notna() ]['top_tg_full'].unique().tolist() unique_ht = set(unique_heads).union(set(unique_tails)) unique_ht = list(unique_ht) unique_ht = [x for x in unique_ht if len(x)>1] # not just "." with open("blast_outputs/ht_uniprot_query.txt", "w") as f: for i, ht in enumerate(unique_ht): if i!= len(unique_ht)-1: f.write(f"{ht}\n") else: f.write(f"{ht}") def main(): # Later, add the argparse thing back in here and change where the log is and what happens depending on wht the user decides # May need to separate blast prep from actual blast for the manuscript, but worry about this later with open_logfile(f"fusion_blast_log.txt"): # Start by preparing BLAST inputs prepare_blast_inputs() # Then run BLAST run_blast("blast_inputs",database="swissprot") ###### Analyze BLAST results # Make database with head and tail info for each fusion, so we know what to expect fuson_ht_db = make_fuson_ht_db(savepath="fuson_ht_db.csv") #parse_all_blast_results(fuson_ht_db, database="swissprot") swissprot_blast_stats, swissprot_top_alignments_df = analyze_blast_results(fuson_ht_db,database="swissprot") swissprot_top_alignments_df = pd.read_csv("blast_outputs/swissprot_top_alignments.csv") get_ht_uniprot_query(swissprot_top_alignments_df) os.makedirs("figures/top_blast_visuals",exist_ok=True) group_pos_id_plot(swissprot_top_alignments_df) if __name__ == '__main__': main()