# fuson_plm/data/blast/blast_fusions.py
### Prepare to BLAST all of our sequences against UniProt
import pandas as pd
import os
import subprocess
import time
import re
import pickle
import numpy as np
from fuson_plm.utils.logging import log_update, open_logfile
from fuson_plm.utils.embedding import redump_pickle_dictionary
from fuson_plm.data.blast.plot import group_difference_plot, group_swiss_and_ht_plot, group_box_plot, group_pos_id_plot
def prepare_blast_inputs():
log_update("\nPreparing BLAST Inputs. Logging every 1000 sequences... ")
# make directory for input and output
os.makedirs("blast_inputs", exist_ok=True)
# read the fuson database
fuson_db = pd.read_csv('../fuson_db.csv')
    # make dictionary mapping sequences to seqids (for naming input files)
fuson_db_dict = dict(zip(fuson_db['aa_seq'],fuson_db['seq_id']))
# convert the database into fasta format
new_fa_files_created = 0
old_fa_files_found = 0
total_seqs_processed=0
for i, (seq, seqid) in enumerate(fuson_db_dict.items()):
total_seqs_processed+=1
# if the path already exists, skip
if os.path.exists(f"blast_inputs/{seqid}.fa"):
old_fa_files_found+=1
else:
new_fa_files_created+=1
            # write the record straight to a .fa file (no temporary .txt + rename needed)
            with open(f"blast_inputs/{seqid}.fa", 'w') as f:
                fasta_lines = '>' + seqid + '\n' + seq
                f.write(fasta_lines)
if i%1000==0:
log_update(f"\t\t{i}\t{seqid}:{seq}")
log_update("\tFinished preparing BLAST Inputs (results in blast_inputs folder)")
log_update(f"\t\tSequences processed: {total_seqs_processed}/{len(fuson_db)} seqs in FusOn-DB\n\t\tFasta files found: {old_fa_files_found}\n\t\tNew fasta files created: {new_fa_files_created}")
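
# Illustrative note (hypothetical sequence shown): a fuson_db.csv row with
# seq_id "seq1" and aa_seq "MSKGEELFT..." becomes blast_inputs/seq1.fa containing:
#   >seq1
#   MSKGEELFT...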
def run_blast(blast_inputs_dir, database="swissprot",n=1,interval=2000):
"""
Run BLAST on all files in blast_inputs_dir
"""
# Must change the PATH variable to include the BLAST executables
os.environ['PATH'] += ":./ncbi-blast-2.16.0+/bin"
os.environ['BLASTDB'] = f"ncbi-blast-2.16.0+/{database}"
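    # Assumes the NCBI BLAST+ 2.16.0 tarball was unpacked into ./ncbi-blast-2.16.0+,
    # with the binaries in bin/ and the preformatted database files (e.g. swissprot.p*)
    # in a subdirectory named after the database.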
# make directory for outputs
os.makedirs("blast_outputs", exist_ok=True)
os.makedirs(f"blast_outputs/{database}", exist_ok=True)
already_blasted = os.listdir(f"blast_outputs/{database}")
blast_input_files = os.listdir(blast_inputs_dir)
# Sort the list using a custom key to extract the numeric part
blast_input_files = sorted(blast_input_files, key=lambda x: int(re.search(r'\d+', x).group()))
# print how many we've already blasted
log_update(f"Running BLAST.\n\t{len(blast_input_files)} input files\n\t{len(already_blasted)} already blasted\n")
tot_seqs_processed = 0
total_blast_time = 0
start_i = interval*(n-1)
end_i = interval*n
if end_i>len(blast_input_files): end_i = len(blast_input_files)
    for i, blast_input_file in enumerate(blast_input_files[start_i:end_i], start=start_i): # start=start_i keeps the logged numbering aligned with the full sorted file list
tot_seqs_processed+=1
# blast_input_file is of the format seqid.fa
seqid = blast_input_file.split('.fa')[0]
input_path = f"blast_inputs/{blast_input_file}"
output_path = f"blast_outputs/{database}/{seqid}_{database}_results.out"
if os.path.exists(output_path):
log_update(f"\t{i+1}.\tAlready blasted {seqid}")
continue
# Construct the command as a list of arguments
command = [
"ncbi-blast-2.16.0+/bin/blastp",
"-db", database,
"-query", input_path,
"-out", output_path
]
# Run the command, and time it
blast_start_time = time.time()
result = subprocess.run(command, capture_output=True, text=True)
blast_end_time = time.time()
blast_seq_time = blast_end_time-blast_start_time
total_blast_time+=blast_seq_time
# Check if there was an error
if result.returncode != 0:
log_update(f"\t{i+1}.\tError running BLAST for {seqid}: {result.stderr} ({blast_seq_time:.2f}s)")
else:
log_update(f"\t{i+1}.\tBLAST search completed for {seqid} ({blast_seq_time:.2f}s)")
log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_blast_time:.2f}s)")
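
# Usage sketch: with the default interval=2000, run_blast("blast_inputs", n=2)
# processes the sorted input files at indices 2000-3999, so the full set can be
# BLASTed in batches across separate invocations by varying n.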
def remove_incomplete_blasts(database="swissprot"):
incomplete_list = []
for fname in os.listdir(f"blast_outputs/{database}"):
complete=False
with open(f"blast_outputs/{database}/{fname}", "r") as f:
lines = f.readlines()
if len(lines)>1 and "Window for multiple hits:" in lines[-1]:
complete=True
if not complete:
incomplete_list.append(fname)
    log_update(f"\t{len(incomplete_list)} BLAST output files are incomplete (due to BLAST errors). Deleting them so they can be rerun.")
# remove all these files
for fname in incomplete_list:
os.remove(f"blast_outputs/{database}/{fname}")
def find_nomatch_blasts(fuson_ht_db, database="swissprot"):
no_match_list = []
for fname in os.listdir(f"blast_outputs/{database}"):
match=True
with open(f"blast_outputs/{database}/{fname}", "r") as f:
lines = f.readlines()
            if len(lines)>1 and any("No hits found" in line for line in lines): # blastp prints "***** No hits found *****" when the query has no hits; scanning all lines is more robust than a fixed line index
match=False
if not match:
no_match_list.append(fname)
log_update(f"\t{len(no_match_list)} sequence IDs had no match in the BLAST database {database}")
# write no match list to a file in blast_outputs
with open(f"blast_outputs/{database}_no_match.txt","w") as f:
for i, fname in enumerate(no_match_list):
if i!=len(no_match_list)-1:
f.write(f"{fname}\n")
else:
f.write(f"{fname}")
# write a subset of fuson_ht_db containing these sequences as well
no_match_ids = [x.split('_')[0] for x in no_match_list]
subset = fuson_ht_db.loc[
fuson_ht_db['seq_id'].isin(no_match_ids)
].reset_index(drop=True)
subset.to_csv(f"blast_outputs/{database}_no_match.csv",index=False)
return no_match_ids
def make_fuson_ht_db(path_to_fuson_db="../fuson_db.csv", path_to_unimap="../head_tail_data/htgenes_uniprotids.csv",savepath="fuson_ht_db.csv"):
"""
Make a version of the fuson_db that has all the heads and tails for each of the genes. Will make it easier to analyze blast results
"""
if os.path.exists(savepath):
df = pd.read_csv(savepath)
return df
    # read both of the databases
fuson_db = pd.read_csv(path_to_fuson_db)
ht_db = pd.read_csv(path_to_unimap)
# Make it such that each row of fuson_db just has ONE head and ONE tail
fuson_ht_db = fuson_db.copy(deep=True)
fuson_ht_db['fusiongenes'] = fuson_ht_db['fusiongenes'].apply(lambda x: x.split(','))
fuson_ht_db = fuson_ht_db.explode('fusiongenes')
fuson_ht_db['hgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[0]
fuson_ht_db['tgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[1]
# Merge on head, then merge on tail
fuson_ht_db = pd.merge( # merge on head
fuson_ht_db,
ht_db.rename(columns={
'Gene': 'hgene',
'UniProtID': 'hgUniProt',
'Reviewed': 'hgUniProtReviewed'
}),
on='hgene',
how='left'
)
fuson_ht_db = pd.merge( # merge on tail
fuson_ht_db,
ht_db.rename(columns={
'Gene': 'tgene',
'UniProtID': 'tgUniProt',
'Reviewed': 'tgUniProtReviewed'
}),
on='tgene',
how='left'
)
# Make sure we haven't lost anything
tot_og_seqids = len(fuson_db['seq_id'].unique())
tot_final_seqids = len(fuson_ht_db['seq_id'].unique())
log_update(f"\tTotal sequence IDs in combined database = {tot_final_seqids}. Matches expected: {tot_final_seqids==tot_og_seqids}")
    # Each seq_id should have as many ROWS as it has fusion gene names (= commas + 1)
    fuson_db['n_fusiongenes'] = fuson_db['fusiongenes'].str.count(',') + 1
    seqid_rows_map = dict(zip(fuson_db['seq_id'],fuson_db['n_fusiongenes']))
vc = fuson_ht_db['seq_id'].value_counts().reset_index()
vc['expected_count'] = vc['index'].map(seqid_rows_map)
log_update(f"\tEach seq_id has the expected number of head-tail combos: {(vc['expected_count']==vc['seq_id']).all()}")
log_update(f"\tPreview of combined database:")
    prev = fuson_ht_db.head(10).copy() # copy so the preview truncation below doesn't warn about modifying a slice
    prev['aa_seq'] = prev['aa_seq'].apply(lambda x: x[0:10]+'...')
log_update(prev.to_string(index=False))
fuson_ht_db.to_csv(savepath, index=False)
return fuson_ht_db
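
# Illustrative note (hypothetical row): a fuson_db entry with fusiongenes
# "EWSR1::FLI1,EWSR1::ERG" is exploded into two rows, one per head::tail pair,
# each annotated with the hgUniProt/tgUniProt IDs (and their reviewed flags)
# looked up from htgenes_uniprotids.csv.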
def format_dict(d, indent=0):
"""
Recursively formats a dictionary for display purposes.
Args:
d (dict): The dictionary to format.
indent (int): The current level of indentation.
Returns:
str: A formatted string representing the dictionary.
"""
formatted_str = ""
# Iterate through each key-value pair in the dictionary
for key, value in d.items():
# Create the current indentation
current_indent = " " * (indent * 4)
# Add the key
formatted_str += f"{current_indent}{repr(key)}: "
# Check the type of the value
if isinstance(value, dict):
# If dictionary, call format_dict recursively
formatted_str += "{\n" + format_dict(value, indent + 1) + current_indent + "},\n"
elif isinstance(value, list):
# If list, convert it to a formatted string
formatted_str += f"[{', '.join(repr(item) for item in value)}],\n"
elif isinstance(value, str):
# If string, enclose in quotes
formatted_str += f"'{value}',\n"
elif value is None:
# If None, display as 'None'
formatted_str += "None,\n"
else:
formatted_str += f"{repr(value)},\n"
return formatted_str
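
# Example: format_dict({'a': {'b': [1, 2], 'c': None}}) returns the string
#   'a': {
#       'b': [1, 2],
#       'c': None,
#   },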
def parse_blast_output(file_path, head_ids, tail_ids):
"""
Args:
- file_path: /path/to/blast/output
    - head_ids: list of all UniProt IDs for the head protein
- tail_ids: list of all UniProt IDs for the tail protein
"""
target_ids = list(set(head_ids + tail_ids)) # make a list to make some functions easier
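    # The parser walks the default pairwise blastp report, which looks roughly
    # like this (abridged, hypothetical values):
    #   >P02671 Fibrinogen alpha chain ...
    #    Score = 1053 bits (2723),  Expect = 0.0, Method: Compositional matrix adjust.
    #    Identities = 512/866 (59%), Positives = 600/866 (69%), Gaps = 12/866 (1%)
    #   Query  1   MSKGE...  60
    #   Sbjct  1   MSKGE...  60
    # It keeps, per UniProt ID of interest, the alignment with the highest bit score.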
with open(file_path, 'r') as file:
best_data = {tid: None for tid in target_ids} # stores the best alignment for each ID we care about
current_data = {tid: {} for tid in target_ids} # stores the current data for each ID we care about (most recent alignment we read)
best_score = {tid: -float('inf') for tid in target_ids} # stores the best score for each ID we care about
capture = {tid: False for tid in target_ids} # whether we are currently processing this ID
replace_best = {tid: False for tid in target_ids} # whether we should replace the best_data with the current_data for this ID
        isoform_dict = {tid: None for tid in target_ids} # maps each target ID to the isoform number of its matched hit (None if canonical)
# variables that will only be used for getting the best alignment
alignment_count = 0
cur_id = None
on_best_alignment=False
# Iterate through lines
for line in file:
line = line.strip()
            # if NEW ID (not necessarily a new alignment! there can be multiple alignments under one >)
if line.startswith('>'):
found_tid_in_header=False # assume we have not found a target ID we are looking for
alignment_count+=1
                if alignment_count==1: # the first alignment listed should be the best one, since BLAST sorts hits by score
on_best_alignment=True
else:
on_best_alignment = False
                ## We may have just finished processing an ID. Check for the one that currently has capture set to True
just_captured = None
total_captured = 0
for k, v in capture.items():
if v:
total_captured+=1
just_captured = k
# we should never be capturing more than one thing at a time. make sure of this
assert total_captured<2
if just_captured is not None:
if replace_best[just_captured]: # if we just finished an alignment for the just_captured ID, and it's the best one, put it in
best_data[just_captured] = current_data[just_captured].copy()
replace_best[just_captured] = False # we just did the replacement, so reset it
# Check if the line contains any of the target IDs.
# This means EITHER [UniProtID] or [UniProtID.Isoform] or [UniProtID-Isoform] is in the line
for tid in target_ids:
pattern = fr">{tid}([.-]\d+)? " # for ID P02671, would match ">P02671 ", ">P02671.2 " and ">P02671-2 "
if re.search(pattern, line): # if this ID matches
isoform_dict[tid] = None # set it to None, update it if we need to
if "." in line: # look for isoform denoted by . if there is one, otherwise it'll stay as None
isoform = int(line.split(".")[1].split(" ")[0])
isoform_dict[tid] = isoform
#print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}")
elif "-" in line: # look for isoform denoted by - if there is one, otherwise it'll stay as None
isoform = int(line.split("-")[1].split(" ")[0])
isoform_dict[tid] = isoform
#print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}")
capture[tid] = True
current_data[tid] = {'header': line}
                    found_tid_in_header=True # we've found the tid in this line, so no need to check the others
else:
capture[tid] = False
if on_best_alignment: # if this is the best alignment
if not(found_tid_in_header): # if none of our TIDs are it
cur_id_full = line.split('>')[1].split(' ')[0]
cur_id, isoform = cur_id_full, None
isoform_dict[cur_id] = None # change this if we need
if "." in cur_id_full: # if there's a dot, it's an isoform.
cur_id = cur_id_full.split(".")[0]
isoform = int(cur_id_full.split(".")[1])
isoform_dict[cur_id] = isoform
#log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}")
#log_update(f"\t\t\tFull line: {line}") # so we can see the gene name. does it make sense?
elif "-" in cur_id_full: # if there's a -, it's an isoform.
cur_id = cur_id_full.split("-")[0]
isoform = int(cur_id_full.split("-")[1])
isoform_dict[cur_id] = isoform
#log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}")
#log_update(f"\t\t\tFull line: {line}") # so we can see the gene name. does it make sense?
# add this id to all the dictionaries
best_data[cur_id] = None
current_data[cur_id] = {}
best_score[cur_id] = -float('inf')
capture[cur_id] = False
replace_best[cur_id] = False
for tid in target_ids:
if capture[tid]: # if we're currently on an alignment for a tid we care about
if 'Score =' in line:
if replace_best[tid]: # if we're replacing the best alignment with this one, within the same ID, do it
best_data[tid] = current_data[tid].copy()
# now reset the variable!
replace_best[tid] = False
score_value = float(line.split()[2]) # Assuming "Score = 1053 bits (2723)" format
current_data[tid] = {} # Reset current_data for this ID
current_data[tid]['Isoform'] = isoform_dict[tid]
current_data[tid]['Score'] = score_value
current_data[tid]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
current_data[tid]['Query_Aligned'] = []
current_data[tid]['Subject_Aligned'] = []
# Set the ID as a head or tail, or neither (neither shouldn't happen here though)
if tid in head_ids:
current_data[tid]['H_or_T'] = 'Head'
if tid in tail_ids:
current_data[tid]['H_or_T'] = 'Head,Tail'
elif tid in tail_ids:
current_data[tid]['H_or_T'] = 'Tail'
else:
current_data[tid]['H_or_T'] = np.nan
current_data[tid]['Best'] = True if on_best_alignment else False
if score_value > best_score[tid]: # if this is the best score we have for an alignment of this protein
best_score[tid] = score_value
replace_best[tid] = True
else:
replace_best[tid] = False
if 'Identities =' in line:
idents = line.split(', ')
current_data[tid]['Identities'] = idents[0].split('=')[1].strip()
current_data[tid]['Positives'] = idents[1].split('=')[1].strip()
current_data[tid]['Gaps'] = idents[2].split('=')[1].strip()
if line.startswith('Query'):
parts = line.split()
if 'Query_Start' not in current_data[tid]:
current_data[tid]['Query_Start'] = int(parts[1])
current_data[tid]['Query_End'] = int(parts[3])
current_data[tid]['Query_Aligned'].append(parts[2])
if line.startswith('Sbjct'):
parts = line.split()
if 'Sbjct_Start' not in current_data[tid]:
current_data[tid]['Sbjct_Start'] = int(parts[1])
current_data[tid]['Sbjct_End'] = int(parts[3])
current_data[tid]['Subject_Aligned'].append(parts[2])
# if we're on the best alignment and it's not one of our target_ids, still process it the same way
if on_best_alignment:
if not(found_tid_in_header):
if 'Score =' in line:
if replace_best[cur_id]: # if we're replacing the best alignment with this one, within the same ID, do it
best_data[cur_id] = current_data[cur_id].copy()
# now reset the variable!
replace_best[cur_id] = False
score_value = float(line.split()[2]) # Assuming "Score = 1053 bits (2723)" format
current_data[cur_id] = {} # Reset current_data for this ID
current_data[cur_id]['Isoform'] = isoform_dict[cur_id]
current_data[cur_id]['Score'] = score_value
current_data[cur_id]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
current_data[cur_id]['Query_Aligned'] = []
current_data[cur_id]['Subject_Aligned'] = []
# Set the ID as a head or tail, or neither
if cur_id in head_ids:
current_data[cur_id]['H_or_T'] = 'Head'
if cur_id in tail_ids:
current_data[cur_id]['H_or_T'] = 'Head,Tail'
elif cur_id in tail_ids:
current_data[cur_id]['H_or_T'] = 'Tail'
else:
current_data[cur_id]['H_or_T'] = np.nan
current_data[cur_id]['Best'] = True
if score_value > best_score[cur_id]: # if this is the best score we have for an alignment of this protein
best_score[cur_id] = score_value
replace_best[cur_id] = True
else:
replace_best[cur_id] = False
if 'Identities =' in line:
idents = line.split(', ')
current_data[cur_id]['Identities'] = idents[0].split('=')[1].strip()
current_data[cur_id]['Positives'] = idents[1].split('=')[1].strip()
current_data[cur_id]['Gaps'] = idents[2].split('=')[1].strip()
if line.startswith('Query'):
parts = line.split()
if 'Query_Start' not in current_data[cur_id]:
current_data[cur_id]['Query_Start'] = int(parts[1])
current_data[cur_id]['Query_End'] = int(parts[3])
current_data[cur_id]['Query_Aligned'].append(parts[2])
if line.startswith('Sbjct'):
parts = line.split()
if 'Sbjct_Start' not in current_data[cur_id]:
current_data[cur_id]['Sbjct_Start'] = int(parts[1])
current_data[cur_id]['Sbjct_End'] = int(parts[3])
current_data[cur_id]['Subject_Aligned'].append(parts[2])
            # add cur_id to target_ids (once) if it's not None, so the end-of-file passes below cover it
            if cur_id is not None and cur_id not in target_ids:
                target_ids.append(cur_id)
# Check at the end of the file if the last scores are the best
for tid in target_ids:
if replace_best[tid]:
best_data[tid] = current_data[tid].copy()
# Combine sequences into single strings for the best data for each ID
for tid in target_ids:
#print(tid)
if best_data[tid]:
#print(f"there is a best alignment for {tid}")
#print(f"best: {best_data[tid]}")
#print(f"current: {current_data[tid]}")
best_data[tid]['Query_Aligned'] = ''.join(best_data[tid]['Query_Aligned'])
best_data[tid]['Subject_Aligned'] = ''.join(best_data[tid]['Subject_Aligned'])
return best_data
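
# Shape of the returned best_data (hypothetical values): one entry per head/tail
# UniProt ID, plus the overall best hit; IDs with no alignment map to None. E.g.
#   {'P02671': {'Isoform': None, 'Score': 1053.0, 'Expect': '0.0',
#               'Query_Aligned': 'MSKGE...', 'Subject_Aligned': 'MSKGE...',
#               'H_or_T': 'Head', 'Best': True, 'Identities': '512/866 (59%)',
#               'Positives': '600/866 (69%)', 'Gaps': '12/866 (1%)',
#               'Query_Start': 1, 'Query_End': 866, 'Sbjct_Start': 1, 'Sbjct_End': 866}}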
def parse_all_blast_results(fuson_ht_db, database="swissprot"):
"""
Analyze the BLAST outputs for each fusion protein against UniProt.
Use the fuson_ht_db to look for the heads and tails that we expect. If they can't be found, ... ?
"""
output_file=f"blast_outputs/{database}_blast_output_analyzed.pkl"
all_seq_ids = fuson_ht_db['seq_id'].unique().tolist()
all_seq_ids = sorted(all_seq_ids, key=lambda x: int(re.search(r'\d+', x).group())) # sort by the number. seq1, seq2, ...
prior_results = {}
if os.path.exists(output_file):
with open(output_file, "rb") as f:
prior_results = pickle.load(f)
# Iterate through seq_ids
total_parse_time = 0
tot_seqs_processed = 0
for seq_id in all_seq_ids:
try:
tot_seqs_processed+=1
# If we've already processed it, skip
if seq_id in prior_results:
log_update(f"\tAlready processed {seq_id} blast results. Continuing")
continue
file_path = f"blast_outputs/{database}/{seq_id}_{database}_results.out"
aa_seq = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['aa_seq'].tolist()[0]
# Remember, fuson_ht_db has all the IDs for ALL the different head and tail gene identifiers.
fusion_genes = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['fusiongenes'].tolist()
##### Process heads
head_ids = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['hgUniProt'].dropna().tolist()
head_reviewed, head_reviewed_dict = "", {}
if len(head_ids)>0: # if we found head IDs, we can process them and figure out if they're reviewed
head_ids = ",".join(head_ids).split(",")
head_reviewed = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['hgUniProtReviewed'].dropna().tolist()
head_reviewed = list("".join(head_reviewed))
head_reviewed_dict = dict(zip(head_ids, head_reviewed))
head_ids = list(head_reviewed_dict.keys()) # there may be some duplicates, so separate them out again
head_reviewed = list(head_reviewed_dict.values())
head_genes = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['hgene'].unique().tolist()
##### Process tails - same logic
tail_ids = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['tgUniProt'].dropna().tolist()
tail_reviewed, tail_reviewed_dict = "", {}
if len(tail_ids)>0: # if we found tail IDs, we can process them and figure out if they're reviewed
tail_ids = ",".join(tail_ids).split(",")
tail_reviewed = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['tgUniProtReviewed'].dropna().tolist()
tail_reviewed = list("".join(tail_reviewed))
tail_reviewed_dict = dict(zip(tail_ids, tail_reviewed))
tail_ids = list(tail_reviewed_dict.keys()) # there may be some duplicates, so separate them out again
tail_reviewed = list(tail_reviewed_dict.values())
tail_genes = fuson_ht_db.loc[
fuson_ht_db['seq_id']==seq_id
]['tgene'].unique().tolist()
###### Log what we just found
log_update(f"\tEvaluating {seq_id}, fusion genes = {fusion_genes}, len = {len(aa_seq)}...\n\t\tfile_path={file_path}")
#log_update(f"\n\t\thead genes={head_genes}\n\t\thead_ids={head_ids}\n\t\ttail genes={tail_genes}\n\t\ttail_ids={tail_ids}")
### Do the analysis and time it
parse_start_time = time.time() # time it
blast_data = parse_blast_output(file_path, head_ids, tail_ids)
parse_end_time = time.time()
parse_seq_time = parse_end_time-parse_start_time
total_parse_time+=parse_seq_time
log_update(f"\t\tBLAST output analysis completed for {seq_id} ({parse_seq_time:.2f}s)")
# Give preview of results. Logging the whole dict would be too much, so let's just see what we found
#log_update(format_dict(blast_data,indent=3))
n_og_reviewed_head_ids = len([x for x in head_reviewed if x=='1'])
found_head_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Head','Head,Tail'])]
n_found_reviewed_head_ids = len([x for x in found_head_ids if head_reviewed_dict[x]=='1'])
n_og_reviewed_tail_ids = len([x for x in tail_reviewed if x=='1'])
found_tail_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Tail','Head,Tail'])]
n_found_reviewed_tail_ids = len([x for x in found_tail_ids if tail_reviewed_dict[x]=='1'])
#log_update(f"\t\t{len(found_head_ids)}/{len(head_ids)} head protein UniProt IDs ({n_found_reviewed_head_ids}/{n_og_reviewed_head_ids} REVIEWED heads) had alignments")
#log_update(f"\t\t{len(found_tail_ids)}/{len(tail_ids)} tail protein UniProt IDs ({n_found_reviewed_tail_ids}/{n_og_reviewed_tail_ids} REVIEWED tails) had alignments")
# write results to pickle file
to_pickle_dict = {seq_id: blast_data}
with open(output_file, 'ab+') as f:
pickle.dump(to_pickle_dict, f)
        except Exception as e: # narrower than a bare except, and logs why the sequence failed
            log_update(f"{seq_id} failed: {e}")
# redump the pickle even if we hit an error, so that we can fix the error and continue processing results
redump_pickle_dictionary(output_file)
# Log total time
log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_parse_time:.2f}s)")
# redump the pickle
redump_pickle_dictionary(output_file)
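
# Note: results are appended to the .pkl one {seq_id: blast_data} dict at a time
# (mode 'ab+'), and redump_pickle_dictionary presumably consolidates these stacked
# pickles into a single dictionary so one pickle.load() call returns everything
# (analyze_blast_results below relies on that).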
def analyze_blast_results(fuson_ht_db, database="swissprot"):
blast_results_path=f"blast_outputs/{database}_blast_output_analyzed.pkl"
stats_df_savepath = f"blast_outputs/{database}_blast_stats.csv"
top_alignments_df_savepath = f"blast_outputs/{database}_top_alignments.csv"
stats_df, top_alignments_df = None, None
if os.path.exists(stats_df_savepath) and os.path.exists(top_alignments_df_savepath):
stats_df = pd.read_csv(stats_df_savepath)
top_alignments_df = pd.read_csv(top_alignments_df_savepath, dtype={'top_hg_UniProt_isoform':'str',
'top_tg_UniProt_isoform': 'str',
'top_UniProt_isoform': 'str'})
else:
with open(blast_results_path, "rb") as f:
results = pickle.load(f)
# analyze the results
# first, basic stats. How many of them have at least one head or tail alignment??
seqid_stats = {}
top_alignments_dict = {}
for seq_id in list(results.keys()):
seqid_stats[seq_id] = {
'hgAlignments': 0,
'tgAlignments': 0,
'totalAlignments': 0,
'best_hgScore': 0,
'best_tgScore': 0,
'best_Score': 0
}
top_alignments_dict[seq_id] = {
'top_hg_UniProtID': None,
'top_hg_UniProt_isoform': None,
'top_hg_UniProt_fus_indices': None,
'top_tg_UniProtID': None,
'top_tg_UniProt_isoform': None,
'top_tg_UniProt_fus_indices': None,
'top_UniProtID': None,
'top_UniProt_isoform': None,
'top_UniProt_fus_indices': None
}
for uniprot, d in results[seq_id].items():
if not(d is None):
isoform = d['Isoform']
# set up the indices string
query_start = d['Query_Start']
if (query_start is None) or (type(query_start)==float and np.isnan(query_start)):
query_start = ''
else:
query_start = int(query_start)
query_end = d['Query_End']
if (query_end is None) or (type(query_end)==float and np.isnan(query_end)):
query_end = ''
else:
query_end = int(query_end)
fus_indices = f"{query_start},{query_end}".strip(",")
if d['H_or_T'] in ['Head', 'Head,Tail']:
seqid_stats[seq_id]['hgAlignments'] +=1
if d['Score'] > seqid_stats[seq_id]['best_hgScore']:
seqid_stats[seq_id]['best_hgScore'] = d['Score']
if type(uniprot)==float or uniprot is None:
top_alignments_dict[seq_id]['top_hg_UniProtID'] = ''
else:
top_alignments_dict[seq_id]['top_hg_UniProtID'] = uniprot
if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = ''
else:
top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = str(int(isoform))
top_alignments_dict[seq_id]['top_hg_UniProt_fus_indices'] = fus_indices
if d['H_or_T'] in ['Tail','Head,Tail']:
seqid_stats[seq_id]['tgAlignments'] +=1
if d['Score'] > seqid_stats[seq_id]['best_tgScore']:
seqid_stats[seq_id]['best_tgScore'] = d['Score']
if type(uniprot)==float or uniprot is None:
top_alignments_dict[seq_id]['top_tg_UniProtID'] = ''
else:
top_alignments_dict[seq_id]['top_tg_UniProtID'] = uniprot
if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = ''
else:
top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = str(int(isoform))
top_alignments_dict[seq_id]['top_tg_UniProt_fus_indices'] = fus_indices
# increment total no matter what type of alignment it is
seqid_stats[seq_id]['totalAlignments']+=1
#if d['Score'] > seqid_stats[seq_id]['best_Score']:
if d['Best']==True: # should be indicated if this is the best!!
seqid_stats[seq_id]['best_Score'] = d['Score']
if type(uniprot)==float or uniprot is None:
top_alignments_dict[seq_id]['top_UniProtID'] = ''
else:
top_alignments_dict[seq_id]['top_UniProtID'] = uniprot
if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
top_alignments_dict[seq_id]['top_UniProt_isoform'] = ''
else:
top_alignments_dict[seq_id]['top_UniProt_isoform'] = str(int(isoform))
top_alignments_dict[seq_id]['top_UniProt_fus_indices'] = fus_indices
# now get positives and identities
                        if 'Identities' not in d: log_update(f"\t\tMissing 'Identities' for {seq_id} {uniprot}; keys = {list(d.keys())}")
identities = d['Identities']
identities = int(identities.split('/')[0])
positives = d['Positives']
positives = int(positives.split('/')[0])
top_alignments_dict[seq_id]['top_UniProt_nIdentities'] = identities
top_alignments_dict[seq_id]['top_UniProt_nPositives'] = positives
stats_df = pd.DataFrame.from_dict(seqid_stats, orient='index').reset_index().rename(columns={'index':'seq_id'})
        stats_df['h_or_t_alignment'] = (stats_df['hgAlignments']>0) | (stats_df['tgAlignments']>0)
        stats_df['h_and_t_alignment'] = (stats_df['hgAlignments']>0) & (stats_df['tgAlignments']>0)
stats_df.to_csv(stats_df_savepath,index=False)
top_alignments_df = pd.DataFrame.from_dict(top_alignments_dict, orient='index').reset_index().rename(columns={'index':'seq_id'})
# add in the sequence length so we can get percentages
fusion_id_seq_dict = dict(zip(fuson_ht_db['seq_id'],fuson_ht_db['aa_seq']))
assert len(fusion_id_seq_dict) == len(fuson_ht_db['seq_id'].unique()) == len(fuson_ht_db['aa_seq'].unique())
top_alignments_df['aa_seq_len'] = top_alignments_df['seq_id'].map(fusion_id_seq_dict).str.len()
top_alignments_df.to_csv(top_alignments_df_savepath,index=False)
    # also, find which sequences have no match at all in this database
    no_match_ids = find_nomatch_blasts(fuson_ht_db, database=database)
log_update(stats_df.head(10).to_string())
# how many have at least one head or tail?
log_update(f"Total sequences: {len(stats_df)}")
log_update(f"Sequences with >=1 head alignment: {len(stats_df.loc[stats_df['hgAlignments']>0])}")
log_update(f"Sequences with >=1 tail alignment: {len(stats_df.loc[stats_df['tgAlignments']>0])}")
log_update(f"Sequences with >=1 head OR tail alignment: {len(stats_df.loc[stats_df['h_or_t_alignment']])}")
log_update(f"Sequences with >=1 head AND tail alignment: {len(stats_df.loc[stats_df['h_and_t_alignment']])}")
log_update(f"Sequences with ANY alignment: {len(stats_df.loc[stats_df['totalAlignments']>0])}")
top_alignments_df = top_alignments_df.replace({None: ''})
log_update(f"Preview of top alignments for {database} search:\n{top_alignments_df.head(10).to_string(index=False)}")
top_alignments_df['hiso'] = top_alignments_df['top_hg_UniProtID']+'-'+top_alignments_df['top_hg_UniProt_isoform']
top_alignments_df['tiso'] = top_alignments_df['top_tg_UniProtID']+'-'+top_alignments_df['top_tg_UniProt_isoform']
top_alignments_df['biso'] = top_alignments_df['top_UniProtID']+'-'+top_alignments_df['top_UniProt_isoform']
top_hgs = set([x.strip('-') for x in top_alignments_df['hiso'].tolist()]) # if things don't have isoforms they'll just end in -
top_tgs = set([x.strip('-') for x in top_alignments_df['tiso'].tolist()])
top_bgs = set([x.strip('-') for x in top_alignments_df['biso'].tolist()])
top_gs = top_hgs | top_tgs | top_bgs
log_update(f"\nTotal unique head proteins (including isoform) producing top head alignments: {len(top_hgs)}")
log_update(f"\nTotal unique tail proteins (including isoform) producing top tail alignments: {len(top_tgs)}")
log_update(f"\nTotal unique proteins (including isoform) - head, tail, or neither - producing top alignments: {len(top_gs)}")
return stats_df, top_alignments_df
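
# stats_df has one row per seq_id with hgAlignments, tgAlignments, totalAlignments,
# best_hgScore, best_tgScore, best_Score, and the derived booleans
# h_or_t_alignment / h_and_t_alignment; top_alignments_df records the top
# head/tail/overall UniProt IDs, isoforms, fusion query indices, identity and
# positive counts, and aa_seq_len.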
def compare_database_blasts(fuson_ht_db, swissprot_blast_stats, fusion_hts_blast_stats, make_new_plots=True):
    # Compare per-sequence BLAST stats between the swissprot search and the fusion head-tail search
    # cols = seq_id hgAlignments tgAlignments totalAlignments best_hgScore best_tgScore best_Score h_or_t_alignment h_and_t_alignment
    # distinguish the columns by prefixing them with the database they came from
og_cols = list(swissprot_blast_stats.columns)[1::]
for c in og_cols:
if c!='seq_id':
swissprot_blast_stats = swissprot_blast_stats.rename(columns={c: f"swiss_{c}"})
for c in og_cols:
if c!='seq_id':
fusion_hts_blast_stats = fusion_hts_blast_stats.rename(columns={c: f"hts_{c}"})
# merge
merged = pd.merge(swissprot_blast_stats,
fusion_hts_blast_stats,
on='seq_id',
how='outer')
diff_cols = og_cols[0:-2]
differences = pd.DataFrame(columns=diff_cols)
log_update(f"Making volcano plots of the differences between fusion head-tail BLAST and swissprot BLAST in the following columns:\n\t{','.join(diff_cols)}")
for c in diff_cols:
differences[c] = merged[f"hts_{c}"] - merged[f"swiss_{c}"]
# make some box plots of differences
# Generate volcano plots for each column
if make_new_plots:
os.makedirs("figures",exist_ok=True)
os.makedirs("figures/database_comparison",exist_ok=True)
os.makedirs("figures/database_comparison/differences",exist_ok=True)
os.makedirs("figures/database_comparison/values",exist_ok=True)
os.makedirs("figures/database_comparison/box",exist_ok=True)
group_difference_plot(differences)
group_swiss_and_ht_plot(merged.drop(columns=['seq_id']), diff_cols)
group_box_plot(merged.drop(columns=['seq_id']), diff_cols)
def fasta_to_dataframe(fasta_file):
# Read the file into a DataFrame with a single column
df = pd.read_fwf(fasta_file, header=None, colspecs=[(0, None)], names=['content'])
# Select even and odd lines using pandas slicing
ids = df.iloc[::2].reset_index(drop=True) # Even-indexed lines (IDs)
sequences = df.iloc[1::2].reset_index(drop=True) # Odd-indexed lines (sequences)
# Combine into a new DataFrame
fasta_df = pd.DataFrame({'ID': ids['content'], 'Sequence': sequences['content']})
fasta_df['ID'] = fasta_df['ID'].str.split('>',expand=True)[1]
fasta_df['Sequence'] = fasta_df['Sequence'].str.strip().str.strip('\n')
# print a preview of this
    temp = fasta_df.head(10).copy() # copy so the preview truncation below doesn't warn about modifying a slice
    temp['Sequence'] = temp['Sequence'].apply(lambda x: x[0:10]+'...')
log_update(f"Preview of head/tail fasta sequences in a dataframe:\n{temp.to_string(index=False)}")
return fasta_df
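
# Note: this assumes a strictly two-line-per-record FASTA (header line, then the
# whole sequence on a single line); records with wrapped sequences would be
# mis-parsed.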
def get_ht_uniprot_query(swissprot_top_alignments_df):
'''
Use swissprot_top_alignments_df to curate all the unique UniProt IDs (ID.Isoform) that created top head and tail alignments
'''
swissprot_top_alignments_df['top_hg_full'] = swissprot_top_alignments_df['top_hg_UniProtID']+'.'+swissprot_top_alignments_df['top_hg_UniProt_isoform']
swissprot_top_alignments_df['top_tg_full'] = swissprot_top_alignments_df['top_tg_UniProtID']+'.'+swissprot_top_alignments_df['top_tg_UniProt_isoform']
unique_heads = swissprot_top_alignments_df.loc[
swissprot_top_alignments_df['top_hg_UniProtID'].notna()
]['top_hg_full'].unique().tolist()
unique_tails = swissprot_top_alignments_df.loc[
swissprot_top_alignments_df['top_tg_UniProtID'].notna()
]['top_tg_full'].unique().tolist()
unique_ht = set(unique_heads).union(set(unique_tails))
unique_ht = list(unique_ht)
unique_ht = [x for x in unique_ht if len(x)>1] # not just "."
with open("blast_outputs/ht_uniprot_query.txt", "w") as f:
for i, ht in enumerate(unique_ht):
if i!= len(unique_ht)-1:
f.write(f"{ht}\n")
else:
f.write(f"{ht}")
def main():
    # Later, add the argparse handling back in here and change where the log goes and what happens depending on what the user decides
    # May need to separate blast prep from the actual blast for the manuscript, but worry about this later
    with open_logfile("fusion_blast_log.txt"):
# Start by preparing BLAST inputs
prepare_blast_inputs()
# Then run BLAST
run_blast("blast_inputs",database="swissprot")
###### Analyze BLAST results
# Make database with head and tail info for each fusion, so we know what to expect
fuson_ht_db = make_fuson_ht_db(savepath="fuson_ht_db.csv")
#parse_all_blast_results(fuson_ht_db, database="swissprot")
swissprot_blast_stats, swissprot_top_alignments_df = analyze_blast_results(fuson_ht_db,database="swissprot")
swissprot_top_alignments_df = pd.read_csv("blast_outputs/swissprot_top_alignments.csv")
get_ht_uniprot_query(swissprot_top_alignments_df)
os.makedirs("figures/top_blast_visuals",exist_ok=True)
group_pos_id_plot(swissprot_top_alignments_df)
if __name__ == '__main__':
main()