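"""
BLAST pipeline for FusOn-DB fusion proteins: write one FASTA per fusion
sequence, run blastp against a local database (swissprot by default), parse
the pairwise text reports for alignments to each fusion's head and tail
genes, and summarize/plot the results.
"""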
import pandas as pd
import os
import subprocess
import time
import re
import pickle
import numpy as np

from fuson_plm.utils.logging import log_update, open_logfile
from fuson_plm.utils.embedding import redump_pickle_dictionary
from fuson_plm.data.blast.plot import group_difference_plot, group_swiss_and_ht_plot, group_box_plot, group_pos_id_plot

def prepare_blast_inputs():
    """Write one single-sequence FASTA file per FusOn-DB entry into blast_inputs/."""
    log_update("\nPreparing BLAST Inputs. Logging every 1000 sequences...")

    os.makedirs("blast_inputs", exist_ok=True)

    fuson_db = pd.read_csv('../fuson_db.csv')

    # Map each amino acid sequence to its sequence ID
    fuson_db_dict = dict(zip(fuson_db['aa_seq'], fuson_db['seq_id']))

    new_fa_files_created = 0
    old_fa_files_found = 0
    total_seqs_processed = 0
    for i, (seq, seqid) in enumerate(fuson_db_dict.items()):
        total_seqs_processed += 1

        if os.path.exists(f"blast_inputs/{seqid}.fa"):
            old_fa_files_found += 1
        else:
            new_fa_files_created += 1
            # Write to a .txt first and rename afterwards, so an interrupted
            # run never leaves a partial .fa that would later be skipped as
            # complete
            with open(f"blast_inputs/{seqid}.txt", 'w') as f:
                fasta_lines = '>' + seqid + '\n' + seq
                f.write(fasta_lines)

            os.rename(f"blast_inputs/{seqid}.txt", f"blast_inputs/{seqid}.fa")

        if i % 1000 == 0:
            log_update(f"\t\t{i}\t{seqid}:{seq}")

    log_update("\tFinished preparing BLAST inputs (results in blast_inputs folder)")
    log_update(f"\t\tSequences processed: {total_seqs_processed}/{len(fuson_db)} seqs in FusOn-DB\n\t\tFasta files found: {old_fa_files_found}\n\t\tNew fasta files created: {new_fa_files_created}")

def run_blast(blast_inputs_dir, database="swissprot", n=1, interval=2000):
    """
    Run blastp on a chunk of the files in blast_inputs_dir.

    Args:
        blast_inputs_dir: directory of single-sequence .fa files
        database: name of the local BLAST database to search
        n: which chunk of input files to process (1-indexed)
        interval: chunk size (number of files per chunk)
    """
    # Make the local BLAST install visible to this process
    os.environ['PATH'] += ":./ncbi-blast-2.16.0+/bin"
    os.environ['BLASTDB'] = f"ncbi-blast-2.16.0+/{database}"

    os.makedirs("blast_outputs", exist_ok=True)
    os.makedirs(f"blast_outputs/{database}", exist_ok=True)
    already_blasted = os.listdir(f"blast_outputs/{database}")
    blast_input_files = os.listdir(blast_inputs_dir)

    # Sort by the number embedded in each filename (e.g., seq1.fa, seq2.fa, ...)
    blast_input_files = sorted(blast_input_files, key=lambda x: int(re.search(r'\d+', x).group()))

    log_update(f"Running BLAST.\n\t{len(blast_input_files)} input files\n\t{len(already_blasted)} already blasted\n")

    tot_seqs_processed = 0
    total_blast_time = 0

    # Process only the n-th chunk of `interval` files, so multiple jobs can
    # split the inputs between them
    start_i = interval * (n - 1)
    end_i = interval * n
    if end_i > len(blast_input_files):
        end_i = len(blast_input_files)
    for i, blast_input_file in enumerate(blast_input_files[start_i:end_i]):
        tot_seqs_processed += 1

        seqid = blast_input_file.split('.fa')[0]
        input_path = f"blast_inputs/{blast_input_file}"
        output_path = f"blast_outputs/{database}/{seqid}_{database}_results.out"

        if os.path.exists(output_path):
            log_update(f"\t{i+1}.\tAlready blasted {seqid}")
            continue

        command = [
            "ncbi-blast-2.16.0+/bin/blastp",
            "-db", database,
            "-query", input_path,
            "-out", output_path
        ]

        blast_start_time = time.time()
        result = subprocess.run(command, capture_output=True, text=True)
        blast_end_time = time.time()
        blast_seq_time = blast_end_time - blast_start_time
        total_blast_time += blast_seq_time

        if result.returncode != 0:
            log_update(f"\t{i+1}.\tError running BLAST for {seqid}: {result.stderr} ({blast_seq_time:.2f}s)")
        else:
            log_update(f"\t{i+1}.\tBLAST search completed for {seqid} ({blast_seq_time:.2f}s)")

    log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_blast_time:.2f}s)")

def remove_incomplete_blasts(database="swissprot"):
    """Delete truncated BLAST reports (missing the closing statistics footer) so they can be rerun."""
    incomplete_list = []
    for fname in os.listdir(f"blast_outputs/{database}"):
        complete = False
        with open(f"blast_outputs/{database}/{fname}", "r") as f:
            lines = f.readlines()
            # A complete blastp text report ends with a statistics footer whose
            # final line reports the window for multiple hits
            if len(lines) > 1 and "Window for multiple hits:" in lines[-1]:
                complete = True
        if not complete:
            incomplete_list.append(fname)

    log_update(f"\t{len(incomplete_list)} BLAST files are incomplete (due to BLAST errors). Deleting them; rerun these.")

    for fname in incomplete_list:
        os.remove(f"blast_outputs/{database}/{fname}")

def find_nomatch_blasts(fuson_ht_db, database="swissprot"):
    """Find BLAST reports with no hits at all; save their IDs and the matching fuson_ht_db rows."""
    no_match_list = []
    for fname in os.listdir(f"blast_outputs/{database}"):
        match = True
        with open(f"blast_outputs/{database}/{fname}", "r") as f:
            lines = f.readlines()
            # In the default blastp pairwise text report, "No hits found"
            # appears on line 29 (index 28); guard against shorter files
            if len(lines) > 28 and "No hits found" in lines[28]:
                match = False
        if not match:
            no_match_list.append(fname)

    log_update(f"\t{len(no_match_list)} sequence IDs had no match in the BLAST database {database}")

    with open(f"blast_outputs/{database}_no_match.txt", "w") as f:
        for i, fname in enumerate(no_match_list):
            if i != len(no_match_list) - 1:
                f.write(f"{fname}\n")
            else:
                f.write(f"{fname}")

    no_match_ids = [x.split('_')[0] for x in no_match_list]
    subset = fuson_ht_db.loc[
        fuson_ht_db['seq_id'].isin(no_match_ids)
    ].reset_index(drop=True)
    subset.to_csv(f"blast_outputs/{database}_no_match.csv", index=False)

    return no_match_ids

def make_fuson_ht_db(path_to_fuson_db="../fuson_db.csv", path_to_unimap="../head_tail_data/htgenes_uniprotids.csv", savepath="fuson_ht_db.csv"):
    """
    Make a version of fuson_db with the head and tail genes (and their UniProt
    IDs) for each fusion, one row per head-tail combination. This makes the
    BLAST results easier to analyze.
    """
    if os.path.exists(savepath):
        df = pd.read_csv(savepath)
        return df

    fuson_db = pd.read_csv(path_to_fuson_db)
    ht_db = pd.read_csv(path_to_unimap)

    # One row per fusion gene pair: split the comma-separated fusiongenes
    # column, explode, then split each pair on '::' into head and tail genes
    fuson_ht_db = fuson_db.copy(deep=True)
    fuson_ht_db['fusiongenes'] = fuson_ht_db['fusiongenes'].apply(lambda x: x.split(','))
    fuson_ht_db = fuson_ht_db.explode('fusiongenes')
    fuson_ht_db['hgene'] = fuson_ht_db['fusiongenes'].str.split('::', expand=True)[0]
    fuson_ht_db['tgene'] = fuson_ht_db['fusiongenes'].str.split('::', expand=True)[1]

    # Attach UniProt IDs and review status for the head gene, then the tail gene
    fuson_ht_db = pd.merge(
        fuson_ht_db,
        ht_db.rename(columns={
            'Gene': 'hgene',
            'UniProtID': 'hgUniProt',
            'Reviewed': 'hgUniProtReviewed'
        }),
        on='hgene',
        how='left'
    )
    fuson_ht_db = pd.merge(
        fuson_ht_db,
        ht_db.rename(columns={
            'Gene': 'tgene',
            'UniProtID': 'tgUniProt',
            'Reviewed': 'tgUniProtReviewed'
        }),
        on='tgene',
        how='left'
    )

    # Sanity checks: no sequence IDs were lost, and each seq_id has one row per
    # fusion gene pair
    tot_og_seqids = len(fuson_db['seq_id'].unique())
    tot_final_seqids = len(fuson_ht_db['seq_id'].unique())
    log_update(f"\tTotal sequence IDs in combined database = {tot_final_seqids}. Matches expected: {tot_final_seqids==tot_og_seqids}")

    fuson_db['n_commas'] = fuson_db['fusiongenes'].str.count(',') + 1
    seqid_rows_map = dict(zip(fuson_db['seq_id'], fuson_db['n_commas']))
    vc = fuson_ht_db['seq_id'].value_counts().reset_index()
    vc.columns = ['seq_id', 'observed_count']  # name explicitly; reset_index naming differs across pandas versions
    vc['expected_count'] = vc['seq_id'].map(seqid_rows_map)
    log_update(f"\tEach seq_id has the expected number of head-tail combos: {(vc['expected_count']==vc['observed_count']).all()}")

    log_update("\tPreview of combined database:")
    prev = fuson_ht_db.head(10).copy()  # copy so the preview truncation can't mutate fuson_ht_db
    prev['aa_seq'] = prev['aa_seq'].apply(lambda x: x[0:10] + '...')
    log_update(prev.to_string(index=False))
    fuson_ht_db.to_csv(savepath, index=False)
    return fuson_ht_db
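
# Illustrative example (hypothetical row): fusiongenes "EWSR1::FLI1,EWSR1::ERG"
# explodes into two rows for the same seq_id, with (hgene, tgene) of
# ("EWSR1", "FLI1") and ("EWSR1", "ERG"), each then merged with its UniProt IDs
# from htgenes_uniprotids.csv.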

def format_dict(d, indent=0):
    """
    Recursively formats a dictionary for display purposes.

    Args:
        d (dict): The dictionary to format.
        indent (int): The current level of indentation.

    Returns:
        str: A formatted string representing the dictionary.
    """
    formatted_str = ""

    for key, value in d.items():
        current_indent = " " * (indent * 4)

        formatted_str += f"{current_indent}{repr(key)}: "

        if isinstance(value, dict):
            formatted_str += "{\n" + format_dict(value, indent + 1) + current_indent + "},\n"
        elif isinstance(value, list):
            formatted_str += f"[{', '.join(repr(item) for item in value)}],\n"
        elif isinstance(value, str):
            formatted_str += f"'{value}',\n"
        elif value is None:
            formatted_str += "None,\n"
        else:
            formatted_str += f"{repr(value)},\n"

    return formatted_str
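
# Example:
#   format_dict({'a': {'b': [1, 2], 'c': None}})
# returns:
#   'a': {
#       'b': [1, 2],
#       'c': None,
#   },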

def parse_blast_output(file_path, head_ids, tail_ids):
    """
    Parse a pairwise blastp text report, keeping the highest-scoring alignment
    to each target UniProt ID and to the report's best (first) subject.

    Args:
        - file_path: /path/to/blast/output
        - head_ids: list of all UniProt IDs for the head protein
        - tail_ids: list of all UniProt IDs for the tail protein
    """
    target_ids = list(set(head_ids + tail_ids))
    with open(file_path, 'r') as file:
        # Per-target state: best alignment so far, the alignment currently being
        # read, and whether the current one should replace the best at the next
        # boundary
        best_data = {tid: None for tid in target_ids}
        current_data = {tid: {} for tid in target_ids}
        best_score = {tid: -float('inf') for tid in target_ids}
        capture = {tid: False for tid in target_ids}
        replace_best = {tid: False for tid in target_ids}
        isoform_dict = {tid: None for tid in target_ids}

        alignment_count = 0
        cur_id = None
        on_best_alignment = False

        for line in file:
            line = line.strip()

            if line.startswith('>'):
                found_tid_in_header = False
                alignment_count += 1
                # The first alignment in the report is the best-scoring one
                on_best_alignment = (alignment_count == 1)

                # At most one target can be mid-capture; bank its pending
                # alignment before moving on to the new subject
                just_captured = None
                total_captured = 0
                for k, v in capture.items():
                    if v:
                        total_captured += 1
                        just_captured = k

                assert total_captured < 2
                if just_captured is not None:
                    if replace_best[just_captured]:
                        best_data[just_captured] = current_data[just_captured].copy()
                        replace_best[just_captured] = False

                # Check whether this subject is one of the target IDs, with an
                # optional isoform suffix like ".2" or "-2"
                for tid in target_ids:
                    pattern = fr">{tid}([.-]\d+)? "
                    m = re.search(pattern, line)
                    if m:
                        isoform_dict[tid] = int(m.group(1)[1:]) if m.group(1) else None
                        capture[tid] = True
                        current_data[tid] = {'header': line}
                        found_tid_in_header = True
                    else:
                        capture[tid] = False

                if on_best_alignment:
                    if not found_tid_in_header:
                        # The best alignment is to a subject we were not looking
                        # for; start tracking it under its own ID
                        cur_id_full = line.split('>')[1].split(' ')[0]
                        cur_id, isoform = cur_id_full, None
                        if "." in cur_id_full:
                            cur_id = cur_id_full.split(".")[0]
                            isoform = int(cur_id_full.split(".")[1])
                        elif "-" in cur_id_full:
                            cur_id = cur_id_full.split("-")[0]
                            isoform = int(cur_id_full.split("-")[1])
                        isoform_dict[cur_id] = isoform

                        best_data[cur_id] = None
                        current_data[cur_id] = {}
                        best_score[cur_id] = -float('inf')
                        capture[cur_id] = False
                        replace_best[cur_id] = False

            for tid in target_ids:
                if capture[tid]:
                    if 'Score =' in line:
                        # New HSP for this subject: bank the previous one if it
                        # was the best so far, then start collecting this one
                        if replace_best[tid]:
                            best_data[tid] = current_data[tid].copy()
                            replace_best[tid] = False

                        score_value = float(line.split()[2])
                        current_data[tid] = {}
                        current_data[tid]['Isoform'] = isoform_dict[tid]
                        current_data[tid]['Score'] = score_value
                        current_data[tid]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
                        current_data[tid]['Query_Aligned'] = []
                        current_data[tid]['Subject_Aligned'] = []

                        if tid in head_ids:
                            current_data[tid]['H_or_T'] = 'Head'
                            if tid in tail_ids:
                                current_data[tid]['H_or_T'] = 'Head,Tail'
                        elif tid in tail_ids:
                            current_data[tid]['H_or_T'] = 'Tail'
                        else:
                            current_data[tid]['H_or_T'] = np.nan

                        current_data[tid]['Best'] = on_best_alignment
                        if score_value > best_score[tid]:
                            best_score[tid] = score_value
                            replace_best[tid] = True
                        else:
                            replace_best[tid] = False

                    if 'Identities =' in line:
                        idents = line.split(', ')
                        current_data[tid]['Identities'] = idents[0].split('=')[1].strip()
                        current_data[tid]['Positives'] = idents[1].split('=')[1].strip()
                        current_data[tid]['Gaps'] = idents[2].split('=')[1].strip()
                    if line.startswith('Query'):
                        parts = line.split()
                        if 'Query_Start' not in current_data[tid]:
                            current_data[tid]['Query_Start'] = int(parts[1])
                        current_data[tid]['Query_End'] = int(parts[3])
                        current_data[tid]['Query_Aligned'].append(parts[2])
                    if line.startswith('Sbjct'):
                        parts = line.split()
                        if 'Sbjct_Start' not in current_data[tid]:
                            current_data[tid]['Sbjct_Start'] = int(parts[1])
                        current_data[tid]['Sbjct_End'] = int(parts[3])
                        current_data[tid]['Subject_Aligned'].append(parts[2])

            # Mirror of the block above for a best alignment whose subject is
            # not one of the target head/tail IDs
            if on_best_alignment:
                if not found_tid_in_header:
                    if 'Score =' in line:
                        if replace_best[cur_id]:
                            best_data[cur_id] = current_data[cur_id].copy()
                            replace_best[cur_id] = False

                        score_value = float(line.split()[2])
                        current_data[cur_id] = {}
                        current_data[cur_id]['Isoform'] = isoform_dict[cur_id]
                        current_data[cur_id]['Score'] = score_value
                        current_data[cur_id]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
                        current_data[cur_id]['Query_Aligned'] = []
                        current_data[cur_id]['Subject_Aligned'] = []

                        if cur_id in head_ids:
                            current_data[cur_id]['H_or_T'] = 'Head'
                            if cur_id in tail_ids:
                                current_data[cur_id]['H_or_T'] = 'Head,Tail'
                        elif cur_id in tail_ids:
                            current_data[cur_id]['H_or_T'] = 'Tail'
                        else:
                            current_data[cur_id]['H_or_T'] = np.nan

                        current_data[cur_id]['Best'] = True
                        if score_value > best_score[cur_id]:
                            best_score[cur_id] = score_value
                            replace_best[cur_id] = True
                        else:
                            replace_best[cur_id] = False

                    if 'Identities =' in line:
                        idents = line.split(', ')
                        current_data[cur_id]['Identities'] = idents[0].split('=')[1].strip()
                        current_data[cur_id]['Positives'] = idents[1].split('=')[1].strip()
                        current_data[cur_id]['Gaps'] = idents[2].split('=')[1].strip()
                    if line.startswith('Query'):
                        parts = line.split()
                        if 'Query_Start' not in current_data[cur_id]:
                            current_data[cur_id]['Query_Start'] = int(parts[1])
                        current_data[cur_id]['Query_End'] = int(parts[3])
                        current_data[cur_id]['Query_Aligned'].append(parts[2])
                    if line.startswith('Sbjct'):
                        parts = line.split()
                        if 'Sbjct_Start' not in current_data[cur_id]:
                            current_data[cur_id]['Sbjct_Start'] = int(parts[1])
                        current_data[cur_id]['Sbjct_End'] = int(parts[3])
                        current_data[cur_id]['Subject_Aligned'].append(parts[2])

        # Include the extra best-alignment subject (if any) in the final sweep
        if cur_id is not None and cur_id not in target_ids:
            target_ids += [cur_id]

        # Bank whatever alignment was still pending at end of file
        for tid in target_ids:
            if replace_best[tid]:
                best_data[tid] = current_data[tid].copy()

        # Join the alignment fragments into single strings
        for tid in target_ids:
            if best_data[tid]:
                best_data[tid]['Query_Aligned'] = ''.join(best_data[tid]['Query_Aligned'])
                best_data[tid]['Subject_Aligned'] = ''.join(best_data[tid]['Subject_Aligned'])

        return best_data
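
# Illustrative (hypothetical IDs and values) return value:
#   {'P12345': {'Isoform': 2, 'Score': 312.0, 'Expect': '1e-105',
#               'H_or_T': 'Head', 'Best': True,
#               'Identities': '150/160 (94%)', 'Positives': '155/160 (96%)',
#               'Gaps': '0/160 (0%)', 'Query_Start': 1, 'Query_End': 160,
#               'Sbjct_Start': 12, 'Sbjct_End': 171,
#               'Query_Aligned': 'MSK...', 'Subject_Aligned': 'MSK...'},
#    'Q67890': None}   # a target ID with no alignment in this report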

def parse_all_blast_results(fuson_ht_db, database="swissprot"):
    """
    Analyze the BLAST outputs for each fusion protein against UniProt, using
    fuson_ht_db to look for the heads and tails we expect. Results are appended
    to a pickle after each sequence so the run can resume; sequences that fail
    to parse are logged and skipped.
    """
    output_file = f"blast_outputs/{database}_blast_output_analyzed.pkl"
    all_seq_ids = fuson_ht_db['seq_id'].unique().tolist()
    all_seq_ids = sorted(all_seq_ids, key=lambda x: int(re.search(r'\d+', x).group()))

    # Resume from prior results if the output pickle already exists
    prior_results = {}
    if os.path.exists(output_file):
        with open(output_file, "rb") as f:
            prior_results = pickle.load(f)

    total_parse_time = 0
    tot_seqs_processed = 0
    for seq_id in all_seq_ids:
        try:
            tot_seqs_processed += 1

            if seq_id in prior_results:
                log_update(f"\tAlready processed {seq_id} blast results. Continuing")
                continue

            file_path = f"blast_outputs/{database}/{seq_id}_{database}_results.out"

            aa_seq = fuson_ht_db.loc[
                fuson_ht_db['seq_id'] == seq_id
            ]['aa_seq'].tolist()[0]

            fusion_genes = fuson_ht_db.loc[
                fuson_ht_db['seq_id'] == seq_id
            ]['fusiongenes'].tolist()

            # Collect the head UniProt IDs (comma-separated per row) and their
            # review flags ('1' = reviewed), deduplicated via a dict
            head_ids = fuson_ht_db.loc[
                fuson_ht_db['seq_id'] == seq_id
            ]['hgUniProt'].dropna().tolist()
            head_reviewed, head_reviewed_dict = "", {}
            if len(head_ids) > 0:
                head_ids = ",".join(head_ids).split(",")
                head_reviewed = fuson_ht_db.loc[
                    fuson_ht_db['seq_id'] == seq_id
                ]['hgUniProtReviewed'].dropna().tolist()
                head_reviewed = list("".join(head_reviewed))

                head_reviewed_dict = dict(zip(head_ids, head_reviewed))
                head_ids = list(head_reviewed_dict.keys())
                head_reviewed = list(head_reviewed_dict.values())

            head_genes = fuson_ht_db.loc[
                fuson_ht_db['seq_id'] == seq_id
            ]['hgene'].unique().tolist()

            # Same for the tail UniProt IDs
            tail_ids = fuson_ht_db.loc[
                fuson_ht_db['seq_id'] == seq_id
            ]['tgUniProt'].dropna().tolist()
            tail_reviewed, tail_reviewed_dict = "", {}
            if len(tail_ids) > 0:
                tail_ids = ",".join(tail_ids).split(",")
                tail_reviewed = fuson_ht_db.loc[
                    fuson_ht_db['seq_id'] == seq_id
                ]['tgUniProtReviewed'].dropna().tolist()
                tail_reviewed = list("".join(tail_reviewed))

                tail_reviewed_dict = dict(zip(tail_ids, tail_reviewed))
                tail_ids = list(tail_reviewed_dict.keys())
                tail_reviewed = list(tail_reviewed_dict.values())

            tail_genes = fuson_ht_db.loc[
                fuson_ht_db['seq_id'] == seq_id
            ]['tgene'].unique().tolist()

            log_update(f"\tEvaluating {seq_id}, fusion genes = {fusion_genes}, len = {len(aa_seq)}...\n\t\tfile_path={file_path}")

            parse_start_time = time.time()
            blast_data = parse_blast_output(file_path, head_ids, tail_ids)
            parse_end_time = time.time()
            parse_seq_time = parse_end_time - parse_start_time
            total_parse_time += parse_seq_time
            log_update(f"\t\tBLAST output analysis completed for {seq_id} ({parse_seq_time:.2f}s)")

            # Count how many of the reviewed head/tail IDs were actually found
            n_og_reviewed_head_ids = len([x for x in head_reviewed if x == '1'])
            found_head_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T', None) in ['Head', 'Head,Tail'])]
            n_found_reviewed_head_ids = len([x for x in found_head_ids if head_reviewed_dict[x] == '1'])

            n_og_reviewed_tail_ids = len([x for x in tail_reviewed if x == '1'])
            found_tail_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T', None) in ['Tail', 'Head,Tail'])]
            n_found_reviewed_tail_ids = len([x for x in found_tail_ids if tail_reviewed_dict[x] == '1'])

            # Append this sequence's results to the pickle
            to_pickle_dict = {seq_id: blast_data}
            with open(output_file, 'ab+') as f:
                pickle.dump(to_pickle_dict, f)

        except Exception as e:
            log_update(f"{seq_id} failed: {e}")

        # Consolidate the appended pickles into one dict so the file stays
        # loadable for resume
        redump_pickle_dictionary(output_file)

    log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_parse_time:.2f}s)")

    redump_pickle_dictionary(output_file)

def analyze_blast_results(fuson_ht_db, database="swissprot"):
    """Summarize parsed BLAST results into per-sequence stats and top-alignment tables."""
    blast_results_path = f"blast_outputs/{database}_blast_output_analyzed.pkl"
    stats_df_savepath = f"blast_outputs/{database}_blast_stats.csv"
    top_alignments_df_savepath = f"blast_outputs/{database}_top_alignments.csv"

    stats_df, top_alignments_df = None, None
    if os.path.exists(stats_df_savepath) and os.path.exists(top_alignments_df_savepath):
        stats_df = pd.read_csv(stats_df_savepath)
        top_alignments_df = pd.read_csv(top_alignments_df_savepath, dtype={'top_hg_UniProt_isoform': 'str',
                                                                           'top_tg_UniProt_isoform': 'str',
                                                                           'top_UniProt_isoform': 'str'})

    else:
        with open(blast_results_path, "rb") as f:
            results = pickle.load(f)

        # For each sequence, tally head/tail alignment counts and best scores,
        # and record which UniProt ID (and isoform) produced each top alignment
        seqid_stats = {}
        top_alignments_dict = {}
        for seq_id in list(results.keys()):
            seqid_stats[seq_id] = {
                'hgAlignments': 0,
                'tgAlignments': 0,
                'totalAlignments': 0,
                'best_hgScore': 0,
                'best_tgScore': 0,
                'best_Score': 0
            }
            top_alignments_dict[seq_id] = {
                'top_hg_UniProtID': None,
                'top_hg_UniProt_isoform': None,
                'top_hg_UniProt_fus_indices': None,
                'top_tg_UniProtID': None,
                'top_tg_UniProt_isoform': None,
                'top_tg_UniProt_fus_indices': None,
                'top_UniProtID': None,
                'top_UniProt_isoform': None,
                'top_UniProt_fus_indices': None
            }
            for uniprot, d in results[seq_id].items():
                if d is not None:
                    isoform = d['Isoform']

                    # Format the aligned region of the fusion (query) as "start,end"
                    query_start = d['Query_Start']
                    if (query_start is None) or (type(query_start) == float and np.isnan(query_start)):
                        query_start = ''
                    else:
                        query_start = int(query_start)
                    query_end = d['Query_End']
                    if (query_end is None) or (type(query_end) == float and np.isnan(query_end)):
                        query_end = ''
                    else:
                        query_end = int(query_end)
                    fus_indices = f"{query_start},{query_end}".strip(",")

                    if d['H_or_T'] in ['Head', 'Head,Tail']:
                        seqid_stats[seq_id]['hgAlignments'] += 1
                        if d['Score'] > seqid_stats[seq_id]['best_hgScore']:
                            seqid_stats[seq_id]['best_hgScore'] = d['Score']
                            if type(uniprot) == float or uniprot is None:
                                top_alignments_dict[seq_id]['top_hg_UniProtID'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_hg_UniProtID'] = uniprot
                            if (type(isoform) == float and np.isnan(isoform)) or isoform is None:
                                top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = str(int(isoform))

                            top_alignments_dict[seq_id]['top_hg_UniProt_fus_indices'] = fus_indices

                    if d['H_or_T'] in ['Tail', 'Head,Tail']:
                        seqid_stats[seq_id]['tgAlignments'] += 1
                        if d['Score'] > seqid_stats[seq_id]['best_tgScore']:
                            seqid_stats[seq_id]['best_tgScore'] = d['Score']
                            if type(uniprot) == float or uniprot is None:
                                top_alignments_dict[seq_id]['top_tg_UniProtID'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_tg_UniProtID'] = uniprot
                            if (type(isoform) == float and np.isnan(isoform)) or isoform is None:
                                top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = str(int(isoform))

                            top_alignments_dict[seq_id]['top_tg_UniProt_fus_indices'] = fus_indices

                    seqid_stats[seq_id]['totalAlignments'] += 1

                    if d['Best']:
                        seqid_stats[seq_id]['best_Score'] = d['Score']
                        if type(uniprot) == float or uniprot is None:
                            top_alignments_dict[seq_id]['top_UniProtID'] = ''
                        else:
                            top_alignments_dict[seq_id]['top_UniProtID'] = uniprot
                        if (type(isoform) == float and np.isnan(isoform)) or isoform is None:
                            top_alignments_dict[seq_id]['top_UniProt_isoform'] = ''
                        else:
                            top_alignments_dict[seq_id]['top_UniProt_isoform'] = str(int(isoform))

                        top_alignments_dict[seq_id]['top_UniProt_fus_indices'] = fus_indices

                        if 'Identities' not in d:
                            print(seq_id, uniprot, d.keys())  # debug aid for malformed entries
                        identities = d['Identities']
                        identities = int(identities.split('/')[0])
                        positives = d['Positives']
                        positives = int(positives.split('/')[0])
                        top_alignments_dict[seq_id]['top_UniProt_nIdentities'] = identities
                        top_alignments_dict[seq_id]['top_UniProt_nPositives'] = positives

        stats_df = pd.DataFrame.from_dict(seqid_stats, orient='index').reset_index().rename(columns={'index': 'seq_id'})
        stats_df['h_or_t_alignment'] = stats_df.apply(lambda row: row['hgAlignments'] > 0 or row['tgAlignments'] > 0, axis=1)
        stats_df['h_and_t_alignment'] = stats_df.apply(lambda row: row['hgAlignments'] > 0 and row['tgAlignments'] > 0, axis=1)
        stats_df.to_csv(stats_df_savepath, index=False)

        top_alignments_df = pd.DataFrame.from_dict(top_alignments_dict, orient='index').reset_index().rename(columns={'index': 'seq_id'})

        fusion_id_seq_dict = dict(zip(fuson_ht_db['seq_id'], fuson_ht_db['aa_seq']))
        assert len(fusion_id_seq_dict) == len(fuson_ht_db['seq_id'].unique()) == len(fuson_ht_db['aa_seq'].unique())
        top_alignments_df['aa_seq_len'] = top_alignments_df['seq_id'].map(fusion_id_seq_dict).str.len()

        top_alignments_df.to_csv(top_alignments_df_savepath, index=False)

    # Log summary statistics and which sequences had no BLAST hits at all
    find_nomatch_blasts(fuson_ht_db, database=database)

    log_update(stats_df.head(10).to_string())

    log_update(f"Total sequences: {len(stats_df)}")
    log_update(f"Sequences with >=1 head alignment: {len(stats_df.loc[stats_df['hgAlignments']>0])}")
    log_update(f"Sequences with >=1 tail alignment: {len(stats_df.loc[stats_df['tgAlignments']>0])}")
    log_update(f"Sequences with >=1 head OR tail alignment: {len(stats_df.loc[stats_df['h_or_t_alignment']])}")
    log_update(f"Sequences with >=1 head AND tail alignment: {len(stats_df.loc[stats_df['h_and_t_alignment']])}")
    log_update(f"Sequences with ANY alignment: {len(stats_df.loc[stats_df['totalAlignments']>0])}")

    # Count unique proteins (ID-isoform) behind the top head, tail, and overall alignments
    top_alignments_df = top_alignments_df.replace({None: ''})
    log_update(f"Preview of top alignments for {database} search:\n{top_alignments_df.head(10).to_string(index=False)}")
    top_alignments_df['hiso'] = top_alignments_df['top_hg_UniProtID'] + '-' + top_alignments_df['top_hg_UniProt_isoform']
    top_alignments_df['tiso'] = top_alignments_df['top_tg_UniProtID'] + '-' + top_alignments_df['top_tg_UniProt_isoform']
    top_alignments_df['biso'] = top_alignments_df['top_UniProtID'] + '-' + top_alignments_df['top_UniProt_isoform']
    top_hgs = set([x.strip('-') for x in top_alignments_df['hiso'].tolist()])
    top_tgs = set([x.strip('-') for x in top_alignments_df['tiso'].tolist()])
    top_bgs = set([x.strip('-') for x in top_alignments_df['biso'].tolist()])
    top_gs = top_hgs | top_tgs | top_bgs
    log_update(f"\nTotal unique head proteins (including isoform) producing top head alignments: {len(top_hgs)}")
    log_update(f"\nTotal unique tail proteins (including isoform) producing top tail alignments: {len(top_tgs)}")
    log_update(f"\nTotal unique proteins (including isoform) - head, tail, or neither - producing top alignments: {len(top_gs)}")

    return stats_df, top_alignments_df
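
# stats_df columns: seq_id, hgAlignments, tgAlignments, totalAlignments,
# best_hgScore, best_tgScore, best_Score, h_or_t_alignment, h_and_t_alignment.
# top_alignments_df holds the UniProt ID, isoform, and fusion indices behind
# each sequence's top head, tail, and overall alignments.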

def compare_database_blasts(fuson_ht_db, swissprot_blast_stats, fusion_hts_blast_stats, make_new_plots=True):
    """Compare per-sequence BLAST stats from the swissprot search and the fusion head/tail database search."""
    # Prefix each stats column with its source database so the two frames can be merged
    og_cols = list(swissprot_blast_stats.columns)[1::]
    for c in og_cols:
        if c != 'seq_id':
            swissprot_blast_stats = swissprot_blast_stats.rename(columns={c: f"swiss_{c}"})
    for c in og_cols:
        if c != 'seq_id':
            fusion_hts_blast_stats = fusion_hts_blast_stats.rename(columns={c: f"hts_{c}"})

    merged = pd.merge(swissprot_blast_stats,
                      fusion_hts_blast_stats,
                      on='seq_id',
                      how='outer')
    # Per-sequence differences (head/tail database minus swissprot) over the
    # numeric stats columns (the last two columns are the boolean flags)
    diff_cols = og_cols[0:-2]
    differences = pd.DataFrame(columns=diff_cols)
    log_update(f"Making volcano plots of the differences between fusion head-tail BLAST and swissprot BLAST in the following columns:\n\t{','.join(diff_cols)}")
    for c in diff_cols:
        differences[c] = merged[f"hts_{c}"] - merged[f"swiss_{c}"]

    if make_new_plots:
        os.makedirs("figures", exist_ok=True)
        os.makedirs("figures/database_comparison", exist_ok=True)
        os.makedirs("figures/database_comparison/differences", exist_ok=True)
        os.makedirs("figures/database_comparison/values", exist_ok=True)
        os.makedirs("figures/database_comparison/box", exist_ok=True)

        group_difference_plot(differences)
        group_swiss_and_ht_plot(merged.drop(columns=['seq_id']), diff_cols)
        group_box_plot(merged.drop(columns=['seq_id']), diff_cols)
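
# Sketch of intended use (file names are assumptions following the
# analyze_blast_results naming scheme; the head/tail stats would come from a
# separate analyze_blast_results run against that database):
#   swiss_stats = pd.read_csv("blast_outputs/swissprot_blast_stats.csv")
#   hts_stats = pd.read_csv("blast_outputs/fusion_heads_tails_blast_stats.csv")  # hypothetical name
#   compare_database_blasts(fuson_ht_db, swiss_stats, hts_stats)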

def fasta_to_dataframe(fasta_file):
    """Read a two-line-per-record FASTA file into a dataframe of (ID, Sequence)."""
    # Read every line as a single fixed-width column
    df = pd.read_fwf(fasta_file, header=None, colspecs=[(0, None)], names=['content'])

    # Even rows are headers, odd rows are sequences (assumes no line wrapping)
    ids = df.iloc[::2].reset_index(drop=True)
    sequences = df.iloc[1::2].reset_index(drop=True)

    fasta_df = pd.DataFrame({'ID': ids['content'], 'Sequence': sequences['content']})
    fasta_df['ID'] = fasta_df['ID'].str.split('>', expand=True)[1]
    fasta_df['Sequence'] = fasta_df['Sequence'].str.strip().str.strip('\n')

    temp = fasta_df.head(10).copy()  # copy so the preview truncation can't mutate fasta_df
    temp['Sequence'] = temp['Sequence'].apply(lambda x: x[0:10] + '...')
    log_update(f"Preview of head/tail fasta sequences in a dataframe:\n{temp.to_string(index=False)}")

    return fasta_df
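
# Assumed input format (hypothetical entries): each record spans exactly two
# lines, a header and the full unwrapped sequence:
#   >P12345.1
#   MSKGEELFTGVVPILVELDGD...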

def get_ht_uniprot_query(swissprot_top_alignments_df):
    '''
    Use swissprot_top_alignments_df to curate all the unique UniProt IDs
    (ID.Isoform) that produced top head and tail alignments, and write them to
    blast_outputs/ht_uniprot_query.txt, one per line.
    '''
    swissprot_top_alignments_df['top_hg_full'] = swissprot_top_alignments_df['top_hg_UniProtID'] + '.' + swissprot_top_alignments_df['top_hg_UniProt_isoform']
    swissprot_top_alignments_df['top_tg_full'] = swissprot_top_alignments_df['top_tg_UniProtID'] + '.' + swissprot_top_alignments_df['top_tg_UniProt_isoform']

    unique_heads = swissprot_top_alignments_df.loc[
        swissprot_top_alignments_df['top_hg_UniProtID'].notna()
    ]['top_hg_full'].unique().tolist()

    unique_tails = swissprot_top_alignments_df.loc[
        swissprot_top_alignments_df['top_tg_UniProtID'].notna()
    ]['top_tg_full'].unique().tolist()

    unique_ht = set(unique_heads).union(set(unique_tails))
    unique_ht = list(unique_ht)
    # Drop entries that are just a lone '.' (missing both ID and isoform)
    unique_ht = [x for x in unique_ht if len(x) > 1]

    with open("blast_outputs/ht_uniprot_query.txt", "w") as f:
        for i, ht in enumerate(unique_ht):
            if i != len(unique_ht) - 1:
                f.write(f"{ht}\n")
            else:
                f.write(f"{ht}")

def main():
    with open_logfile("fusion_blast_log.txt"):
        # Step 1: write one FASTA per fusion sequence
        prepare_blast_inputs()

        # Step 2: BLAST each sequence against swissprot
        run_blast("blast_inputs", database="swissprot")

        # Step 3: build the head/tail-annotated database
        fuson_ht_db = make_fuson_ht_db(savepath="fuson_ht_db.csv")

        # Step 4: summarize the BLAST results and plot
        swissprot_blast_stats, swissprot_top_alignments_df = analyze_blast_results(fuson_ht_db, database="swissprot")

        swissprot_top_alignments_df = pd.read_csv("blast_outputs/swissprot_top_alignments.csv")
        get_ht_uniprot_query(swissprot_top_alignments_df)
        os.makedirs("figures/top_blast_visuals", exist_ok=True)
        group_pos_id_plot(swissprot_top_alignments_df)

if __name__ == '__main__':
    main()