# fuson_plm/data/blast/blast_fusions.py
### Prepare to BLAST all of our sequences against UniProt
import pandas as pd
import os
import subprocess
import time
import re
import pickle
import numpy as np
from fuson_plm.utils.logging import log_update, open_logfile
from fuson_plm.utils.embedding import redump_pickle_dictionary
from fuson_plm.data.blast.plot import group_difference_plot, group_swiss_and_ht_plot, group_box_plot, group_pos_id_plot
def prepare_blast_inputs():
log_update("\nPreparing BLAST Inputs. Logging every 1000 sequences... ")
# make directory for input and output
os.makedirs("blast_inputs", exist_ok=True)
# read the fuson database
fuson_db = pd.read_csv('../fuson_db.csv')
    # make dictionary mapping sequences to seqids (for naming input files)
fuson_db_dict = dict(zip(fuson_db['aa_seq'],fuson_db['seq_id']))
# convert the database into fasta format
new_fa_files_created = 0
old_fa_files_found = 0
total_seqs_processed=0
for i, (seq, seqid) in enumerate(fuson_db_dict.items()):
total_seqs_processed+=1
# if the path already exists, skip
if os.path.exists(f"blast_inputs/{seqid}.fa"):
old_fa_files_found+=1
else:
new_fa_files_created+=1
            # write the record straight to a .fa file (no temporary .txt + rename needed)
            with open(f"blast_inputs/{seqid}.fa", 'w') as f:
                fasta_lines = '>' + seqid + '\n' + seq
                f.write(fasta_lines)
if i%1000==0:
log_update(f"\t\t{i}\t{seqid}:{seq}")
log_update("\tFinished preparing BLAST Inputs (results in blast_inputs folder)")
log_update(f"\t\tSequences processed: {total_seqs_processed}/{len(fuson_db)} seqs in FusOn-DB\n\t\tFasta files found: {old_fa_files_found}\n\t\tNew fasta files created: {new_fa_files_created}")
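
# Illustrative note (hypothetical sequence shown): a fuson_db.csv row with
# seq_id "seq1" and aa_seq "MSKGEELFT..." becomes blast_inputs/seq1.fa containing:
#   >seq1
#   MSKGEELFT...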
def run_blast(blast_inputs_dir, database="swissprot",n=1,interval=2000):
"""
Run BLAST on all files in blast_inputs_dir
"""
# Must change the PATH variable to include the BLAST executables
os.environ['PATH'] += ":./ncbi-blast-2.16.0+/bin"
os.environ['BLASTDB'] = f"ncbi-blast-2.16.0+/{database}"
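    # Assumes the NCBI BLAST+ 2.16.0 tarball was unpacked into ./ncbi-blast-2.16.0+,
    # with the binaries in bin/ and the preformatted database files (e.g. swissprot.p*)
    # in a subdirectory named after the database.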
# make directory for outputs
os.makedirs("blast_outputs", exist_ok=True)
os.makedirs(f"blast_outputs/{database}", exist_ok=True)
already_blasted = os.listdir(f"blast_outputs/{database}")
blast_input_files = os.listdir(blast_inputs_dir)
# Sort the list using a custom key to extract the numeric part
blast_input_files = sorted(blast_input_files, key=lambda x: int(re.search(r'\d+', x).group()))
# print how many we've already blasted
log_update(f"Running BLAST.\n\t{len(blast_input_files)} input files\n\t{len(already_blasted)} already blasted\n")
tot_seqs_processed = 0
total_blast_time = 0
start_i = interval*(n-1)
end_i = interval*n
if end_i>len(blast_input_files): end_i = len(blast_input_files)
    for i, blast_input_file in enumerate(blast_input_files[start_i:end_i], start=start_i): # start=start_i keeps the logged numbering aligned with the full sorted file list
tot_seqs_processed+=1
# blast_input_file is of the format seqid.fa
seqid = blast_input_file.split('.fa')[0]
input_path = f"blast_inputs/{blast_input_file}"
output_path = f"blast_outputs/{database}/{seqid}_{database}_results.out"
if os.path.exists(output_path):
log_update(f"\t{i+1}.\tAlready blasted {seqid}")
continue
# Construct the command as a list of arguments
command = [
"ncbi-blast-2.16.0+/bin/blastp",
"-db", database,
"-query", input_path,
"-out", output_path
]
# Run the command, and time it
blast_start_time = time.time()
result = subprocess.run(command, capture_output=True, text=True)
blast_end_time = time.time()
blast_seq_time = blast_end_time-blast_start_time
total_blast_time+=blast_seq_time
# Check if there was an error
if result.returncode != 0:
log_update(f"\t{i+1}.\tError running BLAST for {seqid}: {result.stderr} ({blast_seq_time:.2f}s)")
else:
log_update(f"\t{i+1}.\tBLAST search completed for {seqid} ({blast_seq_time:.2f}s)")
log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_blast_time:.2f}s)")
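
# Usage sketch: with the default interval=2000, run_blast("blast_inputs", n=2)
# processes the sorted input files at indices 2000-3999, so the full set can be
# BLASTed in batches across separate invocations by varying n.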
def remove_incomplete_blasts(database="swissprot"):
incomplete_list = []
for fname in os.listdir(f"blast_outputs/{database}"):
complete=False
with open(f"blast_outputs/{database}/{fname}", "r") as f:
lines = f.readlines()
if len(lines)>1 and "Window for multiple hits:" in lines[-1]:
complete=True
if not complete:
incomplete_list.append(fname)
    log_update(f"\t{len(incomplete_list)} BLAST output files are incomplete (due to BLAST errors). Deleting them so they can be rerun.")
# remove all these files
for fname in incomplete_list:
os.remove(f"blast_outputs/{database}/{fname}")
def find_nomatch_blasts(fuson_ht_db, database="swissprot"):
no_match_list = []
for fname in os.listdir(f"blast_outputs/{database}"):
match=True
with open(f"blast_outputs/{database}/{fname}", "r") as f:
lines = f.readlines()
            if len(lines)>1 and any("No hits found" in line for line in lines): # blastp prints "***** No hits found *****" when the query has no hits; scanning all lines is more robust than a fixed line index
match=False
if not match:
no_match_list.append(fname)
log_update(f"\t{len(no_match_list)} sequence IDs had no match in the BLAST database {database}")
# write no match list to a file in blast_outputs
with open(f"blast_outputs/{database}_no_match.txt","w") as f:
for i, fname in enumerate(no_match_list):
if i!=len(no_match_list)-1:
f.write(f"{fname}\n")
else:
f.write(f"{fname}")
# write a subset of fuson_ht_db containing these sequences as well
no_match_ids = [x.split('_')[0] for x in no_match_list]
subset = fuson_ht_db.loc[
fuson_ht_db['seq_id'].isin(no_match_ids)
].reset_index(drop=True)
subset.to_csv(f"blast_outputs/{database}_no_match.csv",index=False)
return no_match_ids
def make_fuson_ht_db(path_to_fuson_db="../fuson_db.csv", path_to_unimap="../head_tail_data/htgenes_uniprotids.csv",savepath="fuson_ht_db.csv"):
"""
Make a version of the fuson_db that has all the heads and tails for each of the genes. Will make it easier to analyze blast results
"""
if os.path.exists(savepath):
df = pd.read_csv(savepath)
return df
    # read both of the databases
fuson_db = pd.read_csv(path_to_fuson_db)
ht_db = pd.read_csv(path_to_unimap)
# Make it such that each row of fuson_db just has ONE head and ONE tail
fuson_ht_db = fuson_db.copy(deep=True)
fuson_ht_db['fusiongenes'] = fuson_ht_db['fusiongenes'].apply(lambda x: x.split(','))
fuson_ht_db = fuson_ht_db.explode('fusiongenes')
fuson_ht_db['hgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[0]
fuson_ht_db['tgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[1]
# Merge on head, then merge on tail
fuson_ht_db = pd.merge( # merge on head
fuson_ht_db,
ht_db.rename(columns={
'Gene': 'hgene',
'UniProtID': 'hgUniProt',
'Reviewed': 'hgUniProtReviewed'
}),
on='hgene',
how='left'
)
fuson_ht_db = pd.merge( # merge on tail
fuson_ht_db,
ht_db.rename(columns={
'Gene': 'tgene',
'UniProtID': 'tgUniProt',
'Reviewed': 'tgUniProtReviewed'
}),
on='tgene',
how='left'
)
# Make sure we haven't lost anything
tot_og_seqids = len(fuson_db['seq_id'].unique())
tot_final_seqids = len(fuson_ht_db['seq_id'].unique())
log_update(f"\tTotal sequence IDs in combined database = {tot_final_seqids}. Matches expected: {tot_final_seqids==tot_og_seqids}")
    # Each seq_id should have as many ROWS as it has fusion gene names (= commas + 1)
    fuson_db['n_fusiongenes'] = fuson_db['fusiongenes'].str.count(',') + 1
    seqid_rows_map = dict(zip(fuson_db['seq_id'],fuson_db['n_fusiongenes']))
vc = fuson_ht_db['seq_id'].value_counts().reset_index()
vc['expected_count'] = vc['index'].map(seqid_rows_map)
log_update(f"\tEach seq_id has the expected number of head-tail combos: {(vc['expected_count']==vc['seq_id']).all()}")
log_update(f"\tPreview of combined database:")
    prev = fuson_ht_db.head(10).copy() # copy so the preview truncation below doesn't warn about modifying a slice
    prev['aa_seq'] = prev['aa_seq'].apply(lambda x: x[0:10]+'...')
log_update(prev.to_string(index=False))
fuson_ht_db.to_csv(savepath, index=False)
return fuson_ht_db
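
# Illustrative note (hypothetical row): a fuson_db entry with fusiongenes
# "EWSR1::FLI1,EWSR1::ERG" is exploded into two rows, one per head::tail pair,
# each annotated with the hgUniProt/tgUniProt IDs (and their reviewed flags)
# looked up from htgenes_uniprotids.csv.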
def format_dict(d, indent=0):
"""
Recursively formats a dictionary for display purposes.
Args:
d (dict): The dictionary to format.
indent (int): The current level of indentation.
Returns:
str: A formatted string representing the dictionary.
"""
formatted_str = ""
# Iterate through each key-value pair in the dictionary
for key, value in d.items():
# Create the current indentation
current_indent = " " * (indent * 4)
# Add the key
formatted_str += f"{current_indent}{repr(key)}: "
# Check the type of the value
if isinstance(value, dict):
# If dictionary, call format_dict recursively
formatted_str += "{\n" + format_dict(value, indent + 1) + current_indent + "},\n"
elif isinstance(value, list):
# If list, convert it to a formatted string
formatted_str += f"[{', '.join(repr(item) for item in value)}],\n"
elif isinstance(value, str):
# If string, enclose in quotes
formatted_str += f"'{value}',\n"
elif value is None:
# If None, display as 'None'
formatted_str += "None,\n"
else:
formatted_str += f"{repr(value)},\n"
return formatted_str
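
# Example: format_dict({'a': {'b': [1, 2], 'c': None}}) returns the string
#   'a': {
#       'b': [1, 2],
#       'c': None,
#   },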
def parse_blast_output(file_path, head_ids, tail_ids):
"""
Args:
- file_path: /path/to/blast/output
    - head_ids: list of all UniProt IDs for the head protein
- tail_ids: list of all UniProt IDs for the tail protein
"""
target_ids = list(set(head_ids + tail_ids)) # make a list to make some functions easier
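    # The parser walks the default pairwise blastp report, which looks roughly
    # like this (abridged, hypothetical values):
    #   >P02671 Fibrinogen alpha chain ...
    #    Score = 1053 bits (2723),  Expect = 0.0, Method: Compositional matrix adjust.
    #    Identities = 512/866 (59%), Positives = 600/866 (69%), Gaps = 12/866 (1%)
    #   Query  1   MSKGE...  60
    #   Sbjct  1   MSKGE...  60
    # It keeps, per UniProt ID of interest, the alignment with the highest bit score.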
with open(file_path, 'r') as file:
best_data = {tid: None for tid in target_ids} # stores the best alignment for each ID we care about
current_data = {tid: {} for tid in target_ids} # stores the current data for each ID we care about (most recent alignment we read)
best_score = {tid: -float('inf') for tid in target_ids} # stores the best score for each ID we care about
capture = {tid: False for tid in target_ids} # whether we are currently processing this ID
replace_best = {tid: False for tid in target_ids} # whether we should replace the best_data with the current_data for this ID
        isoform_dict = {tid: None for tid in target_ids} # maps each target ID to the isoform number of its matched hit (None if canonical)
# variables that will only be used for getting the best alignment
alignment_count = 0
cur_id = None
on_best_alignment=False
# Iterate through lines
for line in file:
line = line.strip()
            # if NEW ID (not necessarily a new alignment! there can be multiple alignments under one >)
if line.startswith('>'):
found_tid_in_header=False # assume we have not found a target ID we are looking for
alignment_count+=1
                if alignment_count==1: # the first alignment listed should be the best one, since BLAST sorts hits by score
on_best_alignment=True
else:
on_best_alignment = False
                ## We may have just finished processing an ID. Check for the one that currently has capture set to True
just_captured = None
total_captured = 0
for k, v in capture.items():
if v:
total_captured+=1
just_captured = k
# we should never be capturing more than one thing at a time. make sure of this
assert total_captured<2
if just_captured is not None:
if replace_best[just_captured]: # if we just finished an alignment for the just_captured ID, and it's the best one, put it in
best_data[just_captured] = current_data[just_captured].copy()
replace_best[just_captured] = False # we just did the replacement, so reset it
# Check if the line contains any of the target IDs.
# This means EITHER [UniProtID] or [UniProtID.Isoform] or [UniProtID-Isoform] is in the line
for tid in target_ids:
pattern = fr">{tid}([.-]\d+)? " # for ID P02671, would match ">P02671 ", ">P02671.2 " and ">P02671-2 "
if re.search(pattern, line): # if this ID matches
isoform_dict[tid] = None # set it to None, update it if we need to
if "." in line: # look for isoform denoted by . if there is one, otherwise it'll stay as None
isoform = int(line.split(".")[1].split(" ")[0])
isoform_dict[tid] = isoform
#print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}")
elif "-" in line: # look for isoform denoted by - if there is one, otherwise it'll stay as None
isoform = int(line.split("-")[1].split(" ")[0])
isoform_dict[tid] = isoform
#print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}")
capture[tid] = True
current_data[tid] = {'header': line}
                    found_tid_in_header=True # we've found the tid in this line, so no need to check the others
else:
capture[tid] = False
if on_best_alignment: # if this is the best alignment
if not(found_tid_in_header): # if none of our TIDs are it
cur_id_full = line.split('>')[1].split(' ')[0]
cur_id, isoform = cur_id_full, None
isoform_dict[cur_id] = None # change this if we need
if "." in cur_id_full: # if there's a dot, it's an isoform.
cur_id = cur_id_full.split(".")[0]
isoform = int(cur_id_full.split(".")[1])
isoform_dict[cur_id] = isoform
#log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}")
#log_update(f"\t\t\tFull line: {line}") # so we can see the gene name. does it make sense?
elif "-" in cur_id_full: # if there's a -, it's an isoform.
cur_id = cur_id_full.split("-")[0]
isoform = int(cur_id_full.split("-")[1])
isoform_dict[cur_id] = isoform
#log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}")
#log_update(f"\t\t\tFull line: {line}") # so we can see the gene name. does it make sense?
# add this id to all the dictionaries
best_data[cur_id] = None
current_data[cur_id] = {}
best_score[cur_id] = -float('inf')
capture[cur_id] = False
replace_best[cur_id] = False
for tid in target_ids:
if capture[tid]: # if we're currently on an alignment for a tid we care about
if 'Score =' in line:
if replace_best[tid]: # if we're replacing the best alignment with this one, within the same ID, do it
best_data[tid] = current_data[tid].copy()
# now reset the variable!
replace_best[tid] = False
score_value = float(line.split()[2]) # Assuming "Score = 1053 bits (2723)" format
current_data[tid] = {} # Reset current_data for this ID
current_data[tid]['Isoform'] = isoform_dict[tid]
current_data[tid]['Score'] = score_value
current_data[tid]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
current_data[tid]['Query_Aligned'] = []
current_data[tid]['Subject_Aligned'] = []
# Set the ID as a head or tail, or neither (neither shouldn't happen here though)
if tid in head_ids:
current_data[tid]['H_or_T'] = 'Head'
if tid in tail_ids:
current_data[tid]['H_or_T'] = 'Head,Tail'
elif tid in tail_ids:
current_data[tid]['H_or_T'] = 'Tail'
else:
current_data[tid]['H_or_T'] = np.nan
current_data[tid]['Best'] = True if on_best_alignment else False
if score_value > best_score[tid]: # if this is the best score we have for an alignment of this protein
best_score[tid] = score_value
replace_best[tid] = True
else:
replace_best[tid] = False
if 'Identities =' in line:
idents = line.split(', ')
current_data[tid]['Identities'] = idents[0].split('=')[1].strip()
current_data[tid]['Positives'] = idents[1].split('=')[1].strip()
current_data[tid]['Gaps'] = idents[2].split('=')[1].strip()
if line.startswith('Query'):
parts = line.split()
if 'Query_Start' not in current_data[tid]:
current_data[tid]['Query_Start'] = int(parts[1])
current_data[tid]['Query_End'] = int(parts[3])
current_data[tid]['Query_Aligned'].append(parts[2])
if line.startswith('Sbjct'):
parts = line.split()
if 'Sbjct_Start' not in current_data[tid]:
current_data[tid]['Sbjct_Start'] = int(parts[1])
current_data[tid]['Sbjct_End'] = int(parts[3])
current_data[tid]['Subject_Aligned'].append(parts[2])
# if we're on the best alignment and it's not one of our target_ids, still process it the same way
if on_best_alignment:
if not(found_tid_in_header):
if 'Score =' in line:
if replace_best[cur_id]: # if we're replacing the best alignment with this one, within the same ID, do it
best_data[cur_id] = current_data[cur_id].copy()
# now reset the variable!
replace_best[cur_id] = False
score_value = float(line.split()[2]) # Assuming "Score = 1053 bits (2723)" format
current_data[cur_id] = {} # Reset current_data for this ID
current_data[cur_id]['Isoform'] = isoform_dict[cur_id]
current_data[cur_id]['Score'] = score_value
current_data[cur_id]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
current_data[cur_id]['Query_Aligned'] = []
current_data[cur_id]['Subject_Aligned'] = []
# Set the ID as a head or tail, or neither
if cur_id in head_ids:
current_data[cur_id]['H_or_T'] = 'Head'
if cur_id in tail_ids:
current_data[cur_id]['H_or_T'] = 'Head,Tail'
elif cur_id in tail_ids:
current_data[cur_id]['H_or_T'] = 'Tail'
else:
current_data[cur_id]['H_or_T'] = np.nan
current_data[cur_id]['Best'] = True
if score_value > best_score[cur_id]: # if this is the best score we have for an alignment of this protein
best_score[cur_id] = score_value
replace_best[cur_id] = True
else:
replace_best[cur_id] = False
if 'Identities =' in line:
idents = line.split(', ')
current_data[cur_id]['Identities'] = idents[0].split('=')[1].strip()
current_data[cur_id]['Positives'] = idents[1].split('=')[1].strip()
current_data[cur_id]['Gaps'] = idents[2].split('=')[1].strip()
if line.startswith('Query'):
parts = line.split()
if 'Query_Start' not in current_data[cur_id]:
current_data[cur_id]['Query_Start'] = int(parts[1])
current_data[cur_id]['Query_End'] = int(parts[3])
current_data[cur_id]['Query_Aligned'].append(parts[2])
if line.startswith('Sbjct'):
parts = line.split()
if 'Sbjct_Start' not in current_data[cur_id]:
current_data[cur_id]['Sbjct_Start'] = int(parts[1])
current_data[cur_id]['Sbjct_End'] = int(parts[3])
current_data[cur_id]['Subject_Aligned'].append(parts[2])
            # add cur_id to target_ids (once) if it's not None, so the end-of-file passes below cover it
            if cur_id is not None and cur_id not in target_ids:
                target_ids.append(cur_id)
# Check at the end of the file if the last scores are the best
for tid in target_ids:
if replace_best[tid]:
best_data[tid] = current_data[tid].copy()
# Combine sequences into single strings for the best data for each ID
for tid in target_ids:
#print(tid)
if best_data[tid]:
#print(f"there is a best alignment for {tid}")
#print(f"best: {best_data[tid]}")
#print(f"current: {current_data[tid]}")
best_data[tid]['Query_Aligned'] = ''.join(best_data[tid]['Query_Aligned'])
best_data[tid]['Subject_Aligned'] = ''.join(best_data[tid]['Subject_Aligned'])
return best_data
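
# Shape of the returned best_data (hypothetical values): one entry per head/tail
# UniProt ID, plus the overall best hit; IDs with no alignment map to None. E.g.
#   {'P02671': {'Isoform': None, 'Score': 1053.0, 'Expect': '0.0',
#               'Query_Aligned': 'MSKGE...', 'Subject_Aligned': 'MSKGE...',
#               'H_or_T': 'Head', 'Best': True, 'Identities': '512/866 (59%)',
#               'Positives': '600/866 (69%)', 'Gaps': '12/866 (1%)',
#               'Query_Start': 1, 'Query_End': 866, 'Sbjct_Start': 1, 'Sbjct_End': 866}}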
def parse_all_blast_results(fuson_ht_db, database="swissprot"):
"""
Analyze the BLAST outputs for each fusion protein against UniProt.
Use the fuson_ht_db to look for the heads and tails that we expect. If they can't be found, ... ?
"""
output_file=f"blast_outputs/{database}_blast_output_analyzed.pkl"
all_seq_ids = fuson_ht_db['seq_id'].unique().tolist()
all_seq_ids = sorted(all_seq_ids, key=lambda x: int(re.search(r'\d+', x).group())) # sort by the number. seq1, seq2, ...
prior_results = {}
if os.path.exists(output_file):
with open(output_file, "rb") as f:
prior_results = pickle.load(f)
# Iterate through seq_ids
total_parse_time = 0
tot_seqs_processed = 0
for seq_id in all_seq_ids:
try:
tot_seqs_processed+=1
# If we've already processed it, skip
if seq_id in prior_results:
log_update(f"\tAlready processed {seq_id} blast results. Continuing")
continue
file_path = f"blast_outputs/{database}/{seq_id}_{database}_results.out"
aa_seq = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['aa_seq'].tolist()[0]
# Remember, fuson_ht_db has all the IDs for ALL the different head and tail gene identifiers.
fusion_genes = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['fusiongenes'].tolist()
##### Process heads
head_ids = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['hgUniProt'].dropna().tolist()
head_reviewed, head_reviewed_dict = "", {}
if len(head_ids)>0: # if we found head IDs, we can process them and figure out if they're reviewed
head_ids = ",".join(head_ids).split(",")
head_reviewed = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['hgUniProtReviewed'].dropna().tolist()
head_reviewed = list("".join(head_reviewed))
head_reviewed_dict = dict(zip(head_ids, head_reviewed))
head_ids = list(head_reviewed_dict.keys()) # there may be some duplicates, so separate them out again
head_reviewed = list(head_reviewed_dict.values())
head_genes = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['hgene'].unique().tolist()
##### Process tails - same logic
tail_ids = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['tgUniProt'].dropna().tolist()
tail_reviewed, tail_reviewed_dict = "", {}
if len(tail_ids)>0: # if we found tail IDs, we can process them and figure out if they're reviewed
tail_ids = ",".join(tail_ids).split(",")
tail_reviewed = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['tgUniProtReviewed'].dropna().tolist()
tail_reviewed = list("".join(tail_reviewed))
tail_reviewed_dict = dict(zip(tail_ids, tail_reviewed))
tail_ids = list(tail_reviewed_dict.keys()) # there may be some duplicates, so separate them out again
tail_reviewed = list(tail_reviewed_dict.values())
tail_genes = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['tgene'].unique().tolist()
###### Log what we just found
log_update(f"\tEvaluating {seq_id}, fusion genes = {fusion_genes}, len = {len(aa_seq)}...\n\t\tfile_path={file_path}")
#log_update(f"\n\t\thead genes={head_genes}\n\t\thead_ids={head_ids}\n\t\ttail genes={tail_genes}\n\t\ttail_ids={tail_ids}")
### Do the analysis and time it
parse_start_time = time.time() # time it
blast_data = parse_blast_output(file_path, head_ids, tail_ids)
parse_end_time = time.time()
parse_seq_time = parse_end_time-parse_start_time
total_parse_time+=parse_seq_time
log_update(f"\t\tBLAST output analysis completed for {seq_id} ({parse_seq_time:.2f}s)")
# Give preview of results. Logging the whole dict would be too much, so let's just see what we found
#log_update(format_dict(blast_data,indent=3))
n_og_reviewed_head_ids = len([x for x in head_reviewed if x=='1'])
found_head_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Head','Head,Tail'])]
n_found_reviewed_head_ids = len([x for x in found_head_ids if head_reviewed_dict[x]=='1'])
n_og_reviewed_tail_ids = len([x for x in tail_reviewed if x=='1'])
found_tail_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Tail','Head,Tail'])]
n_found_reviewed_tail_ids = len([x for x in found_tail_ids if tail_reviewed_dict[x]=='1'])
#log_update(f"\t\t{len(found_head_ids)}/{len(head_ids)} head protein UniProt IDs ({n_found_reviewed_head_ids}/{n_og_reviewed_head_ids} REVIEWED heads) had alignments")
#log_update(f"\t\t{len(found_tail_ids)}/{len(tail_ids)} tail protein UniProt IDs ({n_found_reviewed_tail_ids}/{n_og_reviewed_tail_ids} REVIEWED tails) had alignments")
# write results to pickle file
to_pickle_dict = {seq_id: blast_data}
with open(output_file, 'ab+') as f:
pickle.dump(to_pickle_dict, f)
        except Exception as e: # narrower than a bare except, and logs why the sequence failed
            log_update(f"{seq_id} failed: {e}")
# redump the pickle even if we hit an error, so that we can fix the error and continue processing results
redump_pickle_dictionary(output_file)
# Log total time
log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_parse_time:.2f}s)")
# redump the pickle
redump_pickle_dictionary(output_file)
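
# Note: results are appended to the .pkl one {seq_id: blast_data} dict at a time
# (mode 'ab+'), and redump_pickle_dictionary presumably consolidates these stacked
# pickles into a single dictionary so one pickle.load() call returns everything
# (analyze_blast_results below relies on that).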
def analyze_blast_results(fuson_ht_db, database="swissprot"):
blast_results_path=f"blast_outputs/{database}_blast_output_analyzed.pkl"
stats_df_savepath = f"blast_outputs/{database}_blast_stats.csv"
top_alignments_df_savepath = f"blast_outputs/{database}_top_alignments.csv"
stats_df, top_alignments_df = None, None
if os.path.exists(stats_df_savepath) and os.path.exists(top_alignments_df_savepath):
stats_df = pd.read_csv(stats_df_savepath)
top_alignments_df = pd.read_csv(top_alignments_df_savepath, dtype={'top_hg_UniProt_isoform':'str',
'top_tg_UniProt_isoform': 'str',
'top_UniProt_isoform': 'str'})
else:
with open(blast_results_path, "rb") as f:
results = pickle.load(f)
# analyze the results
# first, basic stats. How many of them have at least one head or tail alignment??
seqid_stats = {}
top_alignments_dict = {}
for seq_id in list(results.keys()):
seqid_stats[seq_id] = {
'hgAlignments': 0,
'tgAlignments': 0,
'totalAlignments': 0,
'best_hgScore': 0,
'best_tgScore': 0,
'best_Score': 0
}
top_alignments_dict[seq_id] = {
'top_hg_UniProtID': None,
'top_hg_UniProt_isoform': None,
'top_hg_UniProt_fus_indices': None,
'top_tg_UniProtID': None,
'top_tg_UniProt_isoform': None,
'top_tg_UniProt_fus_indices': None,
'top_UniProtID': None,
'top_UniProt_isoform': None,
'top_UniProt_fus_indices': None
}
for uniprot, d in results[seq_id].items():
if not(d is None):
isoform = d['Isoform']
# set up the indices string
query_start = d['Query_Start']
if (query_start is None) or (type(query_start)==float and np.isnan(query_start)):
query_start = ''
else:
query_start = int(query_start)
query_end = d['Query_End']
if (query_end is None) or (type(query_end)==float and np.isnan(query_end)):
query_end = ''
else:
query_end = int(query_end)
fus_indices = f"{query_start},{query_end}".strip(",")
if d['H_or_T'] in ['Head', 'Head,Tail']:
seqid_stats[seq_id]['hgAlignments'] +=1
if d['Score'] > seqid_stats[seq_id]['best_hgScore']:
seqid_stats[seq_id]['best_hgScore'] = d['Score']
if type(uniprot)==float or uniprot is None:
top_alignments_dict[seq_id]['top_hg_UniProtID'] = ''
else:
top_alignments_dict[seq_id]['top_hg_UniProtID'] = uniprot
if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = ''
else:
top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = str(int(isoform))
top_alignments_dict[seq_id]['top_hg_UniProt_fus_indices'] = fus_indices
if d['H_or_T'] in ['Tail','Head,Tail']:
seqid_stats[seq_id]['tgAlignments'] +=1
if d['Score'] > seqid_stats[seq_id]['best_tgScore']:
seqid_stats[seq_id]['best_tgScore'] = d['Score']
if type(uniprot)==float or uniprot is None:
top_alignments_dict[seq_id]['top_tg_UniProtID'] = ''
else:
top_alignments_dict[seq_id]['top_tg_UniProtID'] = uniprot
if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = ''
else:
top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = str(int(isoform))
top_alignments_dict[seq_id]['top_tg_UniProt_fus_indices'] = fus_indices
# increment total no matter what type of alignment it is
seqid_stats[seq_id]['totalAlignments']+=1
#if d['Score'] > seqid_stats[seq_id]['best_Score']:
if d['Best']==True: # should be indicated if this is the best!!
seqid_stats[seq_id]['best_Score'] = d['Score']
if type(uniprot)==float or uniprot is None:
top_alignments_dict[seq_id]['top_UniProtID'] = ''
else:
top_alignments_dict[seq_id]['top_UniProtID'] = uniprot
if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
top_alignments_dict[seq_id]['top_UniProt_isoform'] = ''
else:
top_alignments_dict[seq_id]['top_UniProt_isoform'] = str(int(isoform))
top_alignments_dict[seq_id]['top_UniProt_fus_indices'] = fus_indices
# now get positives and identities
                        if 'Identities' not in d: log_update(f"\t\tMissing 'Identities' for {seq_id} {uniprot}; keys = {list(d.keys())}")
identities = d['Identities']
identities = int(identities.split('/')[0])
positives = d['Positives']
positives = int(positives.split('/')[0])
top_alignments_dict[seq_id]['top_UniProt_nIdentities'] = identities
top_alignments_dict[seq_id]['top_UniProt_nPositives'] = positives
stats_df = pd.DataFrame.from_dict(seqid_stats, orient='index').reset_index().rename(columns={'index':'seq_id'})
        stats_df['h_or_t_alignment'] = (stats_df['hgAlignments']>0) | (stats_df['tgAlignments']>0)
        stats_df['h_and_t_alignment'] = (stats_df['hgAlignments']>0) & (stats_df['tgAlignments']>0)
stats_df.to_csv(stats_df_savepath,index=False)
top_alignments_df = pd.DataFrame.from_dict(top_alignments_dict, orient='index').reset_index().rename(columns={'index':'seq_id'})
# add in the sequence length so we can get percentages
fusion_id_seq_dict = dict(zip(fuson_ht_db['seq_id'],fuson_ht_db['aa_seq']))
assert len(fusion_id_seq_dict) == len(fuson_ht_db['seq_id'].unique()) == len(fuson_ht_db['aa_seq'].unique())
top_alignments_df['aa_seq_len'] = top_alignments_df['seq_id'].map(fusion_id_seq_dict).str.len()
top_alignments_df.to_csv(top_alignments_df_savepath,index=False)
    # also, find which sequences have no match at all in this database
    no_match_ids = find_nomatch_blasts(fuson_ht_db, database=database)
log_update(stats_df.head(10).to_string())
# how many have at least one head or tail?
log_update(f"Total sequences: {len(stats_df)}")
log_update(f"Sequences with >=1 head alignment: {len(stats_df.loc[stats_df['hgAlignments']>0])}")
log_update(f"Sequences with >=1 tail alignment: {len(stats_df.loc[stats_df['tgAlignments']>0])}")
log_update(f"Sequences with >=1 head OR tail alignment: {len(stats_df.loc[stats_df['h_or_t_alignment']])}")
log_update(f"Sequences with >=1 head AND tail alignment: {len(stats_df.loc[stats_df['h_and_t_alignment']])}")
log_update(f"Sequences with ANY alignment: {len(stats_df.loc[stats_df['totalAlignments']>0])}")
top_alignments_df = top_alignments_df.replace({None: ''})
log_update(f"Preview of top alignments for {database} search:\n{top_alignments_df.head(10).to_string(index=False)}")
top_alignments_df['hiso'] = top_alignments_df['top_hg_UniProtID']+'-'+top_alignments_df['top_hg_UniProt_isoform']
top_alignments_df['tiso'] = top_alignments_df['top_tg_UniProtID']+'-'+top_alignments_df['top_tg_UniProt_isoform']
top_alignments_df['biso'] = top_alignments_df['top_UniProtID']+'-'+top_alignments_df['top_UniProt_isoform']
top_hgs = set([x.strip('-') for x in top_alignments_df['hiso'].tolist()]) # if things don't have isoforms they'll just end in -
top_tgs = set([x.strip('-') for x in top_alignments_df['tiso'].tolist()])
top_bgs = set([x.strip('-') for x in top_alignments_df['biso'].tolist()])
top_gs = top_hgs | top_tgs | top_bgs
log_update(f"\nTotal unique head proteins (including isoform) producing top head alignments: {len(top_hgs)}")
log_update(f"\nTotal unique tail proteins (including isoform) producing top tail alignments: {len(top_tgs)}")
log_update(f"\nTotal unique proteins (including isoform) - head, tail, or neither - producing top alignments: {len(top_gs)}")
return stats_df, top_alignments_df
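
# stats_df has one row per seq_id with hgAlignments, tgAlignments, totalAlignments,
# best_hgScore, best_tgScore, best_Score, and the derived booleans
# h_or_t_alignment / h_and_t_alignment; top_alignments_df records the top
# head/tail/overall UniProt IDs, isoforms, fusion query indices, identity and
# positive counts, and aa_seq_len.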
def compare_database_blasts(fuson_ht_db, swissprot_blast_stats, fusion_hts_blast_stats, make_new_plots=True):
    # Compare per-sequence BLAST stats between the swissprot search and the fusion head-tail search
    # cols = seq_id hgAlignments tgAlignments totalAlignments best_hgScore best_tgScore best_Score h_or_t_alignment h_and_t_alignment
    # distinguish the columns by prefixing them with the database they came from
og_cols = list(swissprot_blast_stats.columns)[1::]
for c in og_cols:
if c!='seq_id':
swissprot_blast_stats = swissprot_blast_stats.rename(columns={c: f"swiss_{c}"})
for c in og_cols:
if c!='seq_id':
fusion_hts_blast_stats = fusion_hts_blast_stats.rename(columns={c: f"hts_{c}"})
# merge
merged = pd.merge(swissprot_blast_stats,
fusion_hts_blast_stats,
on='seq_id',
how='outer')
diff_cols = og_cols[0:-2]
differences = pd.DataFrame(columns=diff_cols)
log_update(f"Making volcano plots of the differences between fusion head-tail BLAST and swissprot BLAST in the following columns:\n\t{','.join(diff_cols)}")
for c in diff_cols:
differences[c] = merged[f"hts_{c}"] - merged[f"swiss_{c}"]
# make some box plots of differences
# Generate volcano plots for each column
if make_new_plots:
os.makedirs("figures",exist_ok=True)
os.makedirs("figures/database_comparison",exist_ok=True)
os.makedirs("figures/database_comparison/differences",exist_ok=True)
os.makedirs("figures/database_comparison/values",exist_ok=True)
os.makedirs("figures/database_comparison/box",exist_ok=True)
group_difference_plot(differences)
group_swiss_and_ht_plot(merged.drop(columns=['seq_id']), diff_cols)
group_box_plot(merged.drop(columns=['seq_id']), diff_cols)
def fasta_to_dataframe(fasta_file):
# Read the file into a DataFrame with a single column
df = pd.read_fwf(fasta_file, header=None, colspecs=[(0, None)], names=['content'])
# Select even and odd lines using pandas slicing
ids = df.iloc[::2].reset_index(drop=True) # Even-indexed lines (IDs)
sequences = df.iloc[1::2].reset_index(drop=True) # Odd-indexed lines (sequences)
# Combine into a new DataFrame
fasta_df = pd.DataFrame({'ID': ids['content'], 'Sequence': sequences['content']})
fasta_df['ID'] = fasta_df['ID'].str.split('>',expand=True)[1]
fasta_df['Sequence'] = fasta_df['Sequence'].str.strip().str.strip('\n')
# print a preview of this
    temp = fasta_df.head(10).copy() # copy so the preview truncation below doesn't warn about modifying a slice
    temp['Sequence'] = temp['Sequence'].apply(lambda x: x[0:10]+'...')
log_update(f"Preview of head/tail fasta sequences in a dataframe:\n{temp.to_string(index=False)}")
return fasta_df
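
# Note: this assumes a strictly two-line-per-record FASTA (header line, then the
# whole sequence on a single line); records with wrapped sequences would be
# mis-parsed.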
def get_ht_uniprot_query(swissprot_top_alignments_df):
'''
Use swissprot_top_alignments_df to curate all the unique UniProt IDs (ID.Isoform) that created top head and tail alignments
'''
swissprot_top_alignments_df['top_hg_full'] = swissprot_top_alignments_df['top_hg_UniProtID']+'.'+swissprot_top_alignments_df['top_hg_UniProt_isoform']
swissprot_top_alignments_df['top_tg_full'] = swissprot_top_alignments_df['top_tg_UniProtID']+'.'+swissprot_top_alignments_df['top_tg_UniProt_isoform']
unique_heads = swissprot_top_alignments_df.loc[
swissprot_top_alignments_df['top_hg_UniProtID'].notna()
]['top_hg_full'].unique().tolist()
unique_tails = swissprot_top_alignments_df.loc[
swissprot_top_alignments_df['top_tg_UniProtID'].notna()
]['top_tg_full'].unique().tolist()
unique_ht = set(unique_heads).union(set(unique_tails))
unique_ht = list(unique_ht)
unique_ht = [x for x in unique_ht if len(x)>1] # not just "."
with open("blast_outputs/ht_uniprot_query.txt", "w") as f:
for i, ht in enumerate(unique_ht):
if i!= len(unique_ht)-1:
f.write(f"{ht}\n")
else:
f.write(f"{ht}")
def main():
    # Later, add the argparse handling back in here and change where the log goes and what happens depending on what the user decides
    # May need to separate blast prep from the actual blast for the manuscript, but worry about this later
    with open_logfile("fusion_blast_log.txt"):
# Start by preparing BLAST inputs
prepare_blast_inputs()
# Then run BLAST
run_blast("blast_inputs",database="swissprot")
###### Analyze BLAST results
# Make database with head and tail info for each fusion, so we know what to expect
fuson_ht_db = make_fuson_ht_db(savepath="fuson_ht_db.csv")
#parse_all_blast_results(fuson_ht_db, database="swissprot")
swissprot_blast_stats, swissprot_top_alignments_df = analyze_blast_results(fuson_ht_db,database="swissprot")
swissprot_top_alignments_df = pd.read_csv("blast_outputs/swissprot_top_alignments.csv")
get_ht_uniprot_query(swissprot_top_alignments_df)
os.makedirs("figures/top_blast_visuals",exist_ok=True)
group_pos_id_plot(swissprot_top_alignments_df)
if __name__ == '__main__':
main()