### Prepare to BLAST all of our sequences against UniProt
import pandas as pd
import os
import subprocess
import time
import re
import pickle
import numpy as np

from fuson_plm.utils.logging import log_update, open_logfile
from fuson_plm.utils.embedding import redump_pickle_dictionary
from fuson_plm.data.blast.plot import group_difference_plot, group_swiss_and_ht_plot, group_box_plot, group_pos_id_plot
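
# NOTE: this script assumes a local NCBI BLAST+ install unpacked at
# ./ncbi-blast-2.16.0+/ with the target database (e.g. swissprot) downloaded
# into ncbi-blast-2.16.0+/<database>/; see the PATH/BLASTDB setup in run_blast.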

def prepare_blast_inputs():
    log_update("\nPreparing BLAST Inputs. Logging every 1000 sequences... ")
    # make directory for the BLAST input files
    os.makedirs("blast_inputs", exist_ok=True)
    
    # read the fuson database
    fuson_db = pd.read_csv('../fuson_db.csv')
    
    # make dictionary mapping sequences to seqids (for naming input files)
    fuson_db_dict = dict(zip(fuson_db['aa_seq'],fuson_db['seq_id']))
    
    # convert the database into fasta format
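    # each sequence is written as a minimal two-line FASTA record, e.g.
    # (illustrative seq_id and truncated sequence):
    #   >seq1
    #   MFSMRIV...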
    new_fa_files_created = 0
    old_fa_files_found = 0
    total_seqs_processed=0
    for i, (seq, seqid) in enumerate(fuson_db_dict.items()):
        total_seqs_processed+=1
        # if the path already exists, skip
        if os.path.exists(f"blast_inputs/{seqid}.fa"):
            old_fa_files_found+=1
        else:
            new_fa_files_created+=1
            with open(f"blast_inputs/{seqid}.txt", 'w') as f:
                fasta_lines = '>' + seqid + '\n' + seq
                f.write(fasta_lines)
            # rename it to .fa
            os.rename(f"blast_inputs/{seqid}.txt", f"blast_inputs/{seqid}.fa")

        if i%1000==0:
            log_update(f"\t\t{i}\t{seqid}:{seq}")
    
    log_update("\tFinished preparing BLAST Inputs (results in blast_inputs folder)")
    log_update(f"\t\tSequences processed: {total_seqs_processed}/{len(fuson_db)} seqs in FusOn-DB\n\t\tFasta files found: {old_fa_files_found}\n\t\tNew fasta files created: {new_fa_files_created}")

def run_blast(blast_inputs_dir, database="swissprot",n=1,interval=2000):
    """
    Run BLAST on all files in blast_inputs_dir
    """  
    # Must change the PATH variable to include the BLAST executables 
    os.environ['PATH'] += ":./ncbi-blast-2.16.0+/bin"   
    os.environ['BLASTDB'] = f"ncbi-blast-2.16.0+/{database}"
    
    # make directory for outputs
    os.makedirs("blast_outputs", exist_ok=True)
    os.makedirs(f"blast_outputs/{database}", exist_ok=True)
    already_blasted = os.listdir(f"blast_outputs/{database}")
    blast_input_files = os.listdir(blast_inputs_dir)
    # Sort the list using a custom key to extract the numeric part
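    # e.g. ['seq1.fa', 'seq10.fa', 'seq2.fa'] -> ['seq1.fa', 'seq2.fa', 'seq10.fa']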
    blast_input_files = sorted(blast_input_files, key=lambda x: int(re.search(r'\d+', x).group()))
    
    # print how many we've already blasted
    log_update(f"Running BLAST.\n\t{len(blast_input_files)} input files\n\t{len(already_blasted)} already blasted\n") 
    
    tot_seqs_processed = 0
    total_blast_time = 0
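    # process batch n of size `interval`: with interval=2000, n=1 covers
    # files[0:2000], n=2 covers files[2000:4000], and so on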
    
    start_i = interval*(n-1)
    end_i = interval*n
    if end_i>len(blast_input_files): end_i = len(blast_input_files)
    for i, blast_input_file in enumerate(blast_input_files[start_i:end_i]):
        tot_seqs_processed+=1
        # blast_input_file is of the format seqid.fa
        seqid = blast_input_file.split('.fa')[0]
        input_path = f"blast_inputs/{blast_input_file}"
        output_path = f"blast_outputs/{database}/{seqid}_{database}_results.out"
        
        if os.path.exists(output_path):
            log_update(f"\t{i+1}.\tAlready blasted {seqid}")
            continue
        
        # Construct the command as a list of arguments
        command = [
            "ncbi-blast-2.16.0+/bin/blastp",
            "-db", database,
            "-query", input_path,
            "-out", output_path
        ]

        # Run the command, and time it
        blast_start_time = time.time()
        result = subprocess.run(command, capture_output=True, text=True)
        blast_end_time = time.time()
        blast_seq_time = blast_end_time-blast_start_time
        total_blast_time+=blast_seq_time

        # Check if there was an error
        if result.returncode != 0:
            log_update(f"\t{i+1}.\tError running BLAST for {seqid}: {result.stderr} ({blast_seq_time:.2f}s)")
        else:
            log_update(f"\t{i+1}.\tBLAST search completed for {seqid} ({blast_seq_time:.2f}s)")
    
    log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_blast_time:.2f}s)")
 
def remove_incomplete_blasts(database="swissprot"):
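    # A finished blastp text report ends with a parameter footer whose final
    # line contains 'Window for multiple hits:'; outputs from interrupted runs
    # lack this marker, so it serves as a completion check.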
    incomplete_list = []
    for fname in os.listdir(f"blast_outputs/{database}"):
        complete=False
        with open(f"blast_outputs/{database}/{fname}", "r") as f:
            lines = f.readlines()
            if len(lines)>1 and "Window for multiple hits:" in lines[-1]:
                complete=True
            if not complete:
                incomplete_list.append(fname)

    log_update(f"\t{len(incomplete_list)} BLAST files are incomplete (due to BLAST errors). Deleting them so they can be rerun.")
    # remove all these files
    for fname in incomplete_list:
        os.remove(f"blast_outputs/{database}/{fname}")
        
def find_nomatch_blasts(fuson_ht_db, database="swissprot"):
    no_match_list = []
    for fname in os.listdir(f"blast_outputs/{database}"):
        match=True
        with open(f"blast_outputs/{database}/{fname}", "r") as f:
            lines = f.readlines()
            if len(lines)>1 and "No hits found" in lines[28]:   # default blastp text output reports 'No hits found' at this fixed header offset when nothing aligned
                match=False
            if not match:
                no_match_list.append(fname)

    log_update(f"\t{len(no_match_list)} sequence IDs had no match in the BLAST database {database}")
    # write no match list to a file in blast_outputs
    with open(f"blast_outputs/{database}_no_match.txt","w") as f:
        for i, fname in enumerate(no_match_list):
            if i!=len(no_match_list)-1:
                f.write(f"{fname}\n")
            else:
                f.write(f"{fname}")
    
    # write a subset of fuson_ht_db containing these sequences as well 
    no_match_ids = [x.split('_')[0] for x in no_match_list]
    subset = fuson_ht_db.loc[
        fuson_ht_db['seq_id'].isin(no_match_ids)
    ].reset_index(drop=True)
    subset.to_csv(f"blast_outputs/{database}_no_match.csv",index=False)
    
    return no_match_ids
    
def make_fuson_ht_db(path_to_fuson_db="../fuson_db.csv", path_to_unimap="../head_tail_data/htgenes_uniprotids.csv",savepath="fuson_ht_db.csv"):
    """
    Make a version of the fuson_db that has all the heads and tails for each of the genes. This makes it easier to analyze BLAST results.
    """
    if os.path.exists(savepath):
        df = pd.read_csv(savepath)
        return df
    
    # read both of the databases
    fuson_db = pd.read_csv(path_to_fuson_db)
    ht_db = pd.read_csv(path_to_unimap)
    
    # Make it such that each row of fuson_db just has ONE head and ONE tail
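    # e.g. an (illustrative) row with fusiongenes 'EWSR1::FLI1,EWSR1::ERG'
    # explodes into two rows: one with hgene=EWSR1/tgene=FLI1 and one with
    # hgene=EWSR1/tgene=ERG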
    fuson_ht_db = fuson_db.copy(deep=True)
    fuson_ht_db['fusiongenes'] = fuson_ht_db['fusiongenes'].apply(lambda x: x.split(','))
    fuson_ht_db = fuson_ht_db.explode('fusiongenes')
    fuson_ht_db['hgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[0]
    fuson_ht_db['tgene'] = fuson_ht_db['fusiongenes'].str.split('::',expand=True)[1]

    # Merge on head, then merge on tail
    fuson_ht_db = pd.merge(             # merge on head
        fuson_ht_db,
        ht_db.rename(columns={
            'Gene': 'hgene',
            'UniProtID': 'hgUniProt',
            'Reviewed': 'hgUniProtReviewed'
        }),
        on='hgene',
        how='left'
    )
    fuson_ht_db = pd.merge(             # merge on tail
        fuson_ht_db,
        ht_db.rename(columns={
            'Gene': 'tgene',
            'UniProtID': 'tgUniProt',
            'Reviewed': 'tgUniProtReviewed'
        }),
        on='tgene',
        how='left'
    )
    
    # Make sure we haven't lost anything
    tot_og_seqids = len(fuson_db['seq_id'].unique())
    tot_final_seqids = len(fuson_ht_db['seq_id'].unique())
    log_update(f"\tTotal sequence IDs in combined database = {tot_final_seqids}. Matches expected: {tot_final_seqids==tot_og_seqids}")
    # Each fusion should have as many rows as fusion-gene entries (commas + 1) in its fusiongenes string
    fuson_db['n_commas'] = fuson_db['fusiongenes'].str.count(',') + 1
    seqid_rows_map = dict(zip(fuson_db['seq_id'],fuson_db['n_commas']))
    vc = fuson_ht_db['seq_id'].value_counts().reset_index()
    vc['expected_count'] = vc['index'].map(seqid_rows_map)
    log_update(f"\tEach seq_id has the expected number of head-tail combos: {(vc['expected_count']==vc['seq_id']).all()}")
    
    log_update(f"\tPreview of combined database:")
    prev = fuson_ht_db.head(10).copy()  # copy so the truncated preview doesn't touch fuson_ht_db before saving
    prev['aa_seq'] = prev['aa_seq'].apply(lambda x: x[0:10]+'...')
    log_update(prev.to_string(index=False))
    fuson_ht_db.to_csv(savepath, index=False)
    return fuson_ht_db

def format_dict(d, indent=0):
    """
    Recursively formats a dictionary for display purposes.
    
    Args:
        d (dict): The dictionary to format.
        indent (int): The current level of indentation.
    
    Returns:
        str: A formatted string representing the dictionary.
    """
    formatted_str = ""
    # Iterate through each key-value pair in the dictionary
    for key, value in d.items():
        # Create the current indentation
        current_indent = " " * (indent * 4)
        # Add the key
        formatted_str += f"{current_indent}{repr(key)}: "
        
        # Check the type of the value
        if isinstance(value, dict):
            # If dictionary, call format_dict recursively
            formatted_str += "{\n" + format_dict(value, indent + 1) + current_indent + "},\n"
        elif isinstance(value, list):
            # If list, convert it to a formatted string
            formatted_str += f"[{', '.join(repr(item) for item in value)}],\n"
        elif isinstance(value, str):
            # If string, enclose in quotes
            formatted_str += f"'{value}',\n"
        elif value is None:
            # If None, display as 'None'
            formatted_str += "None,\n"
        else:
            formatted_str += f"{repr(value)},\n"
    
    return formatted_str

def parse_blast_output(file_path, head_ids, tail_ids):
    """
    Args:
        - file_path: /path/to/blast/output
        - head_ids: list of all UniProt IDs for the head protein
        - tail_ids: list of all UniProt IDs for the tail protein
    """
    target_ids = list(set(head_ids + tail_ids))    # combined list of head and tail IDs to simplify the bookkeeping below
    with open(file_path, 'r') as file:
        best_data = {tid: None for tid in target_ids}   # stores the best alignment for each ID we care about
        current_data = {tid: {} for tid in target_ids}  # stores the current data for each ID we care about (most recent alignment we read)
        best_score = {tid: -float('inf') for tid in target_ids} # stores the best score for each ID we care about
        capture = {tid: False for tid in target_ids}    # whether we are currently processing this ID
        replace_best = {tid: False for tid in target_ids}   # whether we should replace the best_data with the current_data for this ID
        isoform_dict = {tid: None for tid in target_ids}    # dictionary of isoforms for each target ID
        
        # variables that will only be used for getting the best alignment
        alignment_count = 0
        cur_id = None
        on_best_alignment=False

        # Iterate through lines
        for line in file:
            line = line.strip()
            # if NEW ID (not necessarily a new alignment! there can be multiple alignments under one >)
            if line.startswith('>'):
                found_tid_in_header=False   # assume we have not found a target ID we are looking for
                alignment_count+=1
                if alignment_count==1:  # the first alignment listed is the best-scoring one
                    on_best_alignment=True
                else:
                    on_best_alignment = False
                    
                ## We may have just finished processing an ID. Check for the one that currently has capture set to True
                just_captured = None
                total_captured = 0
                for k, v in capture.items():
                    if v:
                        total_captured+=1
                        just_captured = k
                # we should never be capturing more than one thing at a time. make sure of this
                assert total_captured<2
                if just_captured is not None:
                    if replace_best[just_captured]:   # if we just finished an alignment for the just_captured ID, and it's the best one, put it in
                        best_data[just_captured] = current_data[just_captured].copy()
                        replace_best[just_captured] = False     # we just did the replacement, so reset it 
                    
                # Check if the line contains any of the target IDs. 
                # This means EITHER [UniProtID] or [UniProtID.Isoform] or [UniProtID-Isoform] is in the line
                for tid in target_ids:
                    pattern = fr">{tid}([.-]\d+)? "    # for ID P02671, would match ">P02671 ", ">P02671.2 " and ">P02671-2 "
                    if re.search(pattern, line):    # if this ID matches
                        isoform_dict[tid] = None    # set it to None, update it if we need to 
                        if "." in line: # look for isoform denoted by . if there is one, otherwise it'll stay as None
                            isoform = int(line.split(".")[1].split(" ")[0])
                            isoform_dict[tid] = isoform
                            #print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}")
                        elif "-" in line: # look for isoform denoted by - if there is one, otherwise it'll stay as None
                            isoform = int(line.split("-")[1].split(" ")[0])
                            isoform_dict[tid] = isoform
                            #print(f"\t\tID = {tid} (is a head or tail), isoform={isoform}")
                        capture[tid] = True
                        current_data[tid] = {'header': line}
                        found_tid_in_header=True   # we've found the tid that's in this line, so no need to check the others
                    else:
                        capture[tid] = False
                
                if on_best_alignment:   # if this is the best alignment
                    if not found_tid_in_header:    # if this header is not one of our target IDs
                        cur_id_full = line.split('>')[1].split(' ')[0]
                        cur_id, isoform = cur_id_full, None
                        isoform_dict[cur_id] = None # change this if we need
                        if "." in cur_id_full:  # if there's a dot, it's an isoform. 
                            cur_id = cur_id_full.split(".")[0]
                            isoform = int(cur_id_full.split(".")[1])
                            isoform_dict[cur_id] = isoform
                            #log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}")
                            #log_update(f"\t\t\tFull line: {line}")  # so we can see the gene name. does it make sense? 
                        elif "-" in cur_id_full:  # if there's a -, it's an isoform. 
                            cur_id = cur_id_full.split("-")[0]
                            isoform = int(cur_id_full.split("-")[1])
                            isoform_dict[cur_id] = isoform
                            #log_update(f"\t\tID = {cur_id} (best alignment, not a head or tail), isoform={isoform}")
                            #log_update(f"\t\t\tFull line: {line}")  # so we can see the gene name. does it make sense? 
                        # add this id to all the dictionaries
                        best_data[cur_id] = None
                        current_data[cur_id] = {}
                        best_score[cur_id] = -float('inf')
                        capture[cur_id] = False
                        replace_best[cur_id] = False
                        
                            
            for tid in target_ids:
                if capture[tid]:    # if we're currently on an alignment for a tid we care about
                    if 'Score =' in line:
                        if replace_best[tid]:   # if we're replacing the best alignment with this one, within the same ID, do it 
                            best_data[tid] = current_data[tid].copy()
                            # now reset the variable! 
                            replace_best[tid] = False

                        score_value = float(line.split()[2])  # Assuming "Score = 1053 bits (2723)" format
                        current_data[tid] = {}  # Reset current_data for this ID
                        current_data[tid]['Isoform'] = isoform_dict[tid]
                        current_data[tid]['Score'] = score_value
                        current_data[tid]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
                        current_data[tid]['Query_Aligned'] = []
                        current_data[tid]['Subject_Aligned'] = []
                        # Set the ID as a head or tail, or neither (though 'neither' shouldn't occur here)
                        if tid in head_ids:
                            current_data[tid]['H_or_T'] = 'Head' 
                            if tid in tail_ids:
                                current_data[tid]['H_or_T'] = 'Head,Tail'
                        elif tid in tail_ids:
                            current_data[tid]['H_or_T'] = 'Tail'
                        else:
                            current_data[tid]['H_or_T'] = np.nan
                            
                        current_data[tid]['Best'] = on_best_alignment
                        if score_value > best_score[tid]:   # if this is the best score we have for an alignment of this protein
                            best_score[tid] = score_value
                            replace_best[tid] = True
                        else:
                            replace_best[tid] = False

                    if 'Identities =' in line:
                        idents = line.split(', ')
                        current_data[tid]['Identities'] = idents[0].split('=')[1].strip()
                        current_data[tid]['Positives'] = idents[1].split('=')[1].strip()
                        current_data[tid]['Gaps'] = idents[2].split('=')[1].strip()
                    if line.startswith('Query'):
                        parts = line.split()
                        if 'Query_Start' not in current_data[tid]:
                            current_data[tid]['Query_Start'] = int(parts[1])
                        current_data[tid]['Query_End'] = int(parts[3])
                        current_data[tid]['Query_Aligned'].append(parts[2])
                    if line.startswith('Sbjct'):
                        parts = line.split()
                        if 'Sbjct_Start' not in current_data[tid]:
                            current_data[tid]['Sbjct_Start'] = int(parts[1])
                        current_data[tid]['Sbjct_End'] = int(parts[3])
                        current_data[tid]['Subject_Aligned'].append(parts[2])
                        
            # if we're on the best alignment and it's not one of our target_ids, still process it the same way
            if on_best_alignment:
                if not found_tid_in_header:
                    if 'Score =' in line:
                        if replace_best[cur_id]:   # if we're replacing the best alignment with this one, within the same ID, do it 
                            best_data[cur_id] = current_data[cur_id].copy()
                            # now reset the variable! 
                            replace_best[cur_id] = False
                            
                        score_value = float(line.split()[2])  # Assuming "Score = 1053 bits (2723)" format
                        current_data[cur_id] = {}  # Reset current_data for this ID
                        current_data[cur_id]['Isoform'] = isoform_dict[cur_id]
                        current_data[cur_id]['Score'] = score_value
                        current_data[cur_id]['Expect'] = line.split('Expect =')[1].split(', Method')[0].strip()
                        current_data[cur_id]['Query_Aligned'] = []
                        current_data[cur_id]['Subject_Aligned'] = []
                        # Set the ID as a head or tail, or neither
                        if cur_id in head_ids:
                            current_data[cur_id]['H_or_T'] = 'Head' 
                            if cur_id in tail_ids:
                                current_data[cur_id]['H_or_T'] = 'Head,Tail'
                        elif cur_id in tail_ids:
                            current_data[cur_id]['H_or_T'] = 'Tail'
                        else:
                            current_data[cur_id]['H_or_T'] = np.nan
                            
                        current_data[cur_id]['Best'] = True
                        if score_value > best_score[cur_id]:   # if this is the best score we have for an alignment of this protein
                            best_score[cur_id] = score_value
                            replace_best[cur_id] = True
                        else:
                            replace_best[cur_id] = False

                    if 'Identities =' in line:
                        idents = line.split(', ')
                        current_data[cur_id]['Identities'] = idents[0].split('=')[1].strip()
                        current_data[cur_id]['Positives'] = idents[1].split('=')[1].strip()
                        current_data[cur_id]['Gaps'] = idents[2].split('=')[1].strip()
                    if line.startswith('Query'):
                        parts = line.split()
                        if 'Query_Start' not in current_data[cur_id]:
                            current_data[cur_id]['Query_Start'] = int(parts[1])
                        current_data[cur_id]['Query_End'] = int(parts[3])
                        current_data[cur_id]['Query_Aligned'].append(parts[2])
                    if line.startswith('Sbjct'):
                        parts = line.split()
                        if 'Sbjct_Start' not in current_data[cur_id]:
                            current_data[cur_id]['Sbjct_Start'] = int(parts[1])
                        current_data[cur_id]['Sbjct_End'] = int(parts[3])
                        current_data[cur_id]['Subject_Aligned'].append(parts[2])
                    
        # add cur_id to target_ids if it's not None
        if cur_id is not None:
            target_ids += [cur_id]
            
        # Check at the end of the file if the last scores are the best 
        for tid in target_ids:
            if replace_best[tid]:
                best_data[tid] = current_data[tid].copy()

        # Combine sequences into single strings for the best data for each ID
        for tid in target_ids:
            #print(tid)
            if best_data[tid]:
                #print(f"there is a best alignment for {tid}")
                #print(f"best: {best_data[tid]}")
                #print(f"current: {current_data[tid]}")
                best_data[tid]['Query_Aligned'] = ''.join(best_data[tid]['Query_Aligned'])
                best_data[tid]['Subject_Aligned'] = ''.join(best_data[tid]['Subject_Aligned'])

    return best_data

def parse_all_blast_results(fuson_ht_db, database="swissprot"):
    """
    Analyze the BLAST outputs for each fusion protein against UniProt. 
    Use the fuson_ht_db to look for the heads and tails that we expect; expected IDs with no alignment simply remain None in the parsed results.
    """
    output_file=f"blast_outputs/{database}_blast_output_analyzed.pkl"
    all_seq_ids = fuson_ht_db['seq_id'].unique().tolist()
    all_seq_ids = sorted(all_seq_ids, key=lambda x: int(re.search(r'\d+', x).group()))  # sort by the number. seq1, seq2, ...
    
    prior_results = {}
    if os.path.exists(output_file):
        with open(output_file, "rb") as f:
            prior_results = pickle.load(f)
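    # note: results are appended below as one pickled {seq_id: ...} dict per
    # sequence ('ab+' mode); redump_pickle_dictionary is assumed to consolidate
    # the appended dicts into a single dictionary so the single pickle.load
    # above recovers all prior results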
    
    # Iterate through seq_ids 
    total_parse_time = 0
    tot_seqs_processed = 0
    for seq_id in all_seq_ids:
        try: 
            tot_seqs_processed+=1
            # If we've already processed it, skip 
            if seq_id in prior_results: 
                log_update(f"\tAlready processed {seq_id} blast results. Continuing")
                continue
            
            file_path = f"blast_outputs/{database}/{seq_id}_{database}_results.out"
            
            aa_seq = fuson_ht_db.loc[
                fuson_ht_db['seq_id']==seq_id
            ]['aa_seq'].tolist()[0]
            
            # Remember, fuson_ht_db has all the IDs for ALL the different head and tail gene identifiers. 
            fusion_genes = fuson_ht_db.loc[
                fuson_ht_db['seq_id']==seq_id
            ]['fusiongenes'].tolist()
            
            ##### Process heads
            head_ids = fuson_ht_db.loc[
                fuson_ht_db['seq_id']==seq_id
            ]['hgUniProt'].dropna().tolist()
            head_reviewed, head_reviewed_dict = "", {}
            if len(head_ids)>0: # if we found head IDs, we can process them and figure out if they're reviewed
                head_ids = ",".join(head_ids).split(",")
                head_reviewed = fuson_ht_db.loc[
                    fuson_ht_db['seq_id']==seq_id
                ]['hgUniProtReviewed'].dropna().tolist()
                head_reviewed = list("".join(head_reviewed))
                
                head_reviewed_dict = dict(zip(head_ids, head_reviewed))
                head_ids = list(head_reviewed_dict.keys())      # there may be some duplicates, so separate them out again
                head_reviewed = list(head_reviewed_dict.values())
            
            head_genes = fuson_ht_db.loc[
                fuson_ht_db['seq_id']==seq_id
            ]['hgene'].unique().tolist()
            
            ##### Process tails - same logic
            tail_ids = fuson_ht_db.loc[
                fuson_ht_db['seq_id']==seq_id
            ]['tgUniProt'].dropna().tolist()
            tail_reviewed, tail_reviewed_dict = "", {}
            if len(tail_ids)>0: # if we found tail IDs, we can process them and figure out if they're reviewed
                tail_ids = ",".join(tail_ids).split(",")
                tail_reviewed = fuson_ht_db.loc[
                    fuson_ht_db['seq_id']==seq_id
                ]['tgUniProtReviewed'].dropna().tolist()
                tail_reviewed = list("".join(tail_reviewed))
                
                tail_reviewed_dict = dict(zip(tail_ids, tail_reviewed))
                tail_ids = list(tail_reviewed_dict.keys())      # there may be some duplicates, so separate them out again
                tail_reviewed = list(tail_reviewed_dict.values())
            
            tail_genes = fuson_ht_db.loc[
                fuson_ht_db['seq_id']==seq_id
            ]['tgene'].unique().tolist()
            
            ###### Log what we just found
            log_update(f"\tEvaluating {seq_id}, fusion genes = {fusion_genes}, len = {len(aa_seq)}...\n\t\tfile_path={file_path}")
            #log_update(f"\n\t\thead genes={head_genes}\n\t\thead_ids={head_ids}\n\t\ttail genes={tail_genes}\n\t\ttail_ids={tail_ids}")
            
            ### Do the analysis and time it
            parse_start_time = time.time()       # time it 
            blast_data = parse_blast_output(file_path, head_ids, tail_ids)
            parse_end_time = time.time()
            parse_seq_time = parse_end_time-parse_start_time
            total_parse_time+=parse_seq_time
            log_update(f"\t\tBLAST output analysis completed for {seq_id} ({parse_seq_time:.2f}s)")
            
            # Give preview of results. Logging the whole dict would be too much, so let's just see what we found
            #log_update(format_dict(blast_data,indent=3))
            n_og_reviewed_head_ids = len([x for x in head_reviewed if x=='1'])
            found_head_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Head','Head,Tail'])]
            n_found_reviewed_head_ids = len([x for x in found_head_ids if head_reviewed_dict[x]=='1'])
            
            n_og_reviewed_tail_ids = len([x for x in tail_reviewed if x=='1'])
            found_tail_ids = [x for x in list(blast_data.keys()) if (blast_data[x] is not None) and (blast_data[x].get('H_or_T',None) in ['Tail','Head,Tail'])]
            n_found_reviewed_tail_ids = len([x for x in found_tail_ids if tail_reviewed_dict[x]=='1'])
            
            #log_update(f"\t\t{len(found_head_ids)}/{len(head_ids)} head protein UniProt IDs ({n_found_reviewed_head_ids}/{n_og_reviewed_head_ids} REVIEWED heads) had alignments")
            #log_update(f"\t\t{len(found_tail_ids)}/{len(tail_ids)} tail protein UniProt IDs ({n_found_reviewed_tail_ids}/{n_og_reviewed_tail_ids} REVIEWED tails) had alignments")
        
            # write results to pickle file
            to_pickle_dict = {seq_id: blast_data}
            with open(output_file, 'ab+') as f:
                pickle.dump(to_pickle_dict, f)
    
        except Exception as e:
            log_update(f"\t{seq_id} failed: {e}")
            # redump the pickle even if we hit an error, so that we can fix the error and continue processing results
            redump_pickle_dictionary(output_file)
            
    # Log total time
    log_update(f"\tFinished processing {tot_seqs_processed} sequences ({total_parse_time:.2f}s)")
    
    # redump the pickle
    redump_pickle_dictionary(output_file)

def analyze_blast_results(fuson_ht_db, database="swissprot"):
    blast_results_path=f"blast_outputs/{database}_blast_output_analyzed.pkl"
    stats_df_savepath = f"blast_outputs/{database}_blast_stats.csv"
    top_alignments_df_savepath = f"blast_outputs/{database}_top_alignments.csv"
    
    stats_df, top_alignments_df = None, None
    if os.path.exists(stats_df_savepath) and os.path.exists(top_alignments_df_savepath):
        stats_df = pd.read_csv(stats_df_savepath)
        top_alignments_df = pd.read_csv(top_alignments_df_savepath, dtype={'top_hg_UniProt_isoform':'str',
                                                                            'top_tg_UniProt_isoform': 'str',
                                                                            'top_UniProt_isoform': 'str'})
        
    else:
        with open(blast_results_path, "rb") as f:
            results = pickle.load(f)
            
        # analyze the results
        # first, basic stats. How many of them have at least one head or tail alignment??
        seqid_stats = {}
        top_alignments_dict = {}
        for seq_id in list(results.keys()):
            seqid_stats[seq_id] = {
                'hgAlignments': 0,
                'tgAlignments': 0,
                'totalAlignments': 0,
                'best_hgScore': 0,
                'best_tgScore': 0,
                'best_Score': 0
            }
            top_alignments_dict[seq_id] = {
                'top_hg_UniProtID': None,
                'top_hg_UniProt_isoform': None,
                'top_hg_UniProt_fus_indices': None,
                'top_tg_UniProtID': None,
                'top_tg_UniProt_isoform': None,
                'top_tg_UniProt_fus_indices': None,
                'top_UniProtID': None,
                'top_UniProt_isoform': None,
                'top_UniProt_fus_indices': None
            }
            for uniprot, d in results[seq_id].items():
                if d is not None:
                    isoform = d['Isoform']
                    # set up the indices string
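                    # e.g. Query_Start=12, Query_End=245 -> '12,245'; a missing
                    # start or end becomes '' and strip(',') drops the dangling comma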
                    query_start = d['Query_Start']
                    if (query_start is None) or (type(query_start)==float and np.isnan(query_start)):
                        query_start = ''
                    else:
                        query_start = int(query_start)
                    query_end = d['Query_End']
                    if (query_end is None) or (type(query_end)==float and np.isnan(query_end)):
                        query_end = ''
                    else:
                        query_end = int(query_end)
                    fus_indices = f"{query_start},{query_end}".strip(",")
                    
                    if d['H_or_T'] in ['Head', 'Head,Tail']:
                        seqid_stats[seq_id]['hgAlignments'] +=1
                        if d['Score'] > seqid_stats[seq_id]['best_hgScore']:
                            seqid_stats[seq_id]['best_hgScore'] = d['Score']
                            if type(uniprot)==float or uniprot is None:
                                top_alignments_dict[seq_id]['top_hg_UniProtID'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_hg_UniProtID'] = uniprot
                            if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
                                top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_hg_UniProt_isoform'] = str(int(isoform))
                            
                            top_alignments_dict[seq_id]['top_hg_UniProt_fus_indices'] = fus_indices
                            
                    if d['H_or_T'] in ['Tail','Head,Tail']:
                        seqid_stats[seq_id]['tgAlignments'] +=1
                        if d['Score'] > seqid_stats[seq_id]['best_tgScore']:
                            seqid_stats[seq_id]['best_tgScore'] = d['Score']
                            if type(uniprot)==float or uniprot is None:
                                top_alignments_dict[seq_id]['top_tg_UniProtID'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_tg_UniProtID'] = uniprot
                            if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
                                top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = ''
                            else:
                                top_alignments_dict[seq_id]['top_tg_UniProt_isoform'] = str(int(isoform))
                            
                            top_alignments_dict[seq_id]['top_tg_UniProt_fus_indices'] = fus_indices
                    # increment total no matter what type of alignment it is
                    seqid_stats[seq_id]['totalAlignments']+=1
                    #if d['Score'] > seqid_stats[seq_id]['best_Score']:
                    if d['Best']==True: # rely on the 'Best' flag set during parsing rather than re-deriving it from scores
                        seqid_stats[seq_id]['best_Score'] = d['Score']
                        if type(uniprot)==float or uniprot is None:
                            top_alignments_dict[seq_id]['top_UniProtID'] = ''
                        else:
                            top_alignments_dict[seq_id]['top_UniProtID'] = uniprot
                        if (type(isoform)==float and np.isnan(isoform)) or isoform is None:
                            top_alignments_dict[seq_id]['top_UniProt_isoform'] = ''
                        else:
                            top_alignments_dict[seq_id]['top_UniProt_isoform'] = str(int(isoform))
                        
                        top_alignments_dict[seq_id]['top_UniProt_fus_indices'] = fus_indices
                        # now get positives and identities
                        if 'Identities' not in d: log_update(f"\tMissing Identities for {seq_id} {uniprot}: keys={list(d.keys())}")
                        identities = d['Identities']
                        identities = int(identities.split('/')[0])
                        positives = d['Positives']
                        positives = int(positives.split('/')[0])
                        top_alignments_dict[seq_id]['top_UniProt_nIdentities'] = identities
                        top_alignments_dict[seq_id]['top_UniProt_nPositives'] = positives
                        
        
        stats_df = pd.DataFrame.from_dict(seqid_stats, orient='index').reset_index().rename(columns={'index':'seq_id'})
        stats_df['h_or_t_alignment'] = (stats_df['hgAlignments']>0) | (stats_df['tgAlignments']>0)
        stats_df['h_and_t_alignment'] = (stats_df['hgAlignments']>0) & (stats_df['tgAlignments']>0)
        stats_df.to_csv(stats_df_savepath,index=False)
        
        top_alignments_df = pd.DataFrame.from_dict(top_alignments_dict, orient='index').reset_index().rename(columns={'index':'seq_id'})
        # add in the sequence length so we can get percentages
        fusion_id_seq_dict = dict(zip(fuson_ht_db['seq_id'],fuson_ht_db['aa_seq']))
        assert len(fusion_id_seq_dict) == len(fuson_ht_db['seq_id'].unique()) == len(fuson_ht_db['aa_seq'].unique())
        top_alignments_df['aa_seq_len'] = top_alignments_df['seq_id'].map(fusion_id_seq_dict).str.len()
        
        top_alignments_df.to_csv(top_alignments_df_savepath,index=False)
    # also, find which sequences have no match at all
    no_match_list1 = find_nomatch_blasts(fuson_ht_db, database=database)
    
    log_update(stats_df.head(10).to_string())
    # how many have at least one head or tail?
    log_update(f"Total sequences: {len(stats_df)}")
    log_update(f"Sequences with >=1 head alignment: {len(stats_df.loc[stats_df['hgAlignments']>0])}")
    log_update(f"Sequences with >=1 tail alignment: {len(stats_df.loc[stats_df['tgAlignments']>0])}")
    log_update(f"Sequences with >=1 head OR tail alignment: {len(stats_df.loc[stats_df['h_or_t_alignment']])}")
    log_update(f"Sequences with >=1 head AND tail alignment: {len(stats_df.loc[stats_df['h_and_t_alignment']])}")
    log_update(f"Sequences with ANY alignment: {len(stats_df.loc[stats_df['totalAlignments']>0])}")
    
    top_alignments_df = top_alignments_df.replace({None: ''})
    log_update(f"Preview of top alignments for {database} search:\n{top_alignments_df.head(10).to_string(index=False)}")
    top_alignments_df['hiso'] = top_alignments_df['top_hg_UniProtID']+'-'+top_alignments_df['top_hg_UniProt_isoform']
    top_alignments_df['tiso'] = top_alignments_df['top_tg_UniProtID']+'-'+top_alignments_df['top_tg_UniProt_isoform']
    top_alignments_df['biso'] = top_alignments_df['top_UniProtID']+'-'+top_alignments_df['top_UniProt_isoform']
    top_hgs = set([x.strip('-') for x in top_alignments_df['hiso'].tolist()])   # if things don't have isoforms they'll just end in -
    top_tgs = set([x.strip('-') for x in top_alignments_df['tiso'].tolist()])
    top_bgs = set([x.strip('-') for x in top_alignments_df['biso'].tolist()])
    top_gs = top_hgs | top_tgs | top_bgs
    log_update(f"\nTotal unique head proteins (including isoform) producing top head alignments: {len(top_hgs)}")
    log_update(f"\nTotal unique tail proteins (including isoform) producing top tail alignments: {len(top_tgs)}")
    log_update(f"\nTotal unique proteins (including isoform) - head, tail, or neither - producing top alignments: {len(top_gs)}")
    
    
    
    return stats_df, top_alignments_df

def compare_database_blasts(fuson_ht_db, swissprot_blast_stats, fusion_hts_blast_stats, make_new_plots=True):
    # compare per-sequence BLAST stats between the swissprot and fusion head-tail searches
    # cols = seq_id  hgAlignments  tgAlignments  totalAlignments  best_hgScore  best_tgScore  best_Score  h_or_t_alignment  h_and_t_alignment
    
    # distinguish the columns
    og_cols = list(swissprot_blast_stats.columns)[1::]
    for c in og_cols:
        if c!='seq_id':
            swissprot_blast_stats = swissprot_blast_stats.rename(columns={c: f"swiss_{c}"})
    for c in og_cols:
        if c!='seq_id':
            fusion_hts_blast_stats = fusion_hts_blast_stats.rename(columns={c: f"hts_{c}"})
            
    # merge
    merged = pd.merge(swissprot_blast_stats,
                      fusion_hts_blast_stats,
                      on='seq_id',
                      how='outer')
    diff_cols = og_cols[0:-2]
    differences = pd.DataFrame(columns=diff_cols)
    log_update(f"Making volcano plots of the differences between fusion head-tail BLAST and swissprot BLAST in the following columns:\n\t{','.join(diff_cols)}")
    for c in diff_cols:
        differences[c] = merged[f"hts_{c}"] - merged[f"swiss_{c}"]

    # make some box plots of differences 
    # Generate volcano plots for each column
    if make_new_plots:
        os.makedirs("figures",exist_ok=True)
        os.makedirs("figures/database_comparison",exist_ok=True)
        os.makedirs("figures/database_comparison/differences",exist_ok=True)
        os.makedirs("figures/database_comparison/values",exist_ok=True)
        os.makedirs("figures/database_comparison/box",exist_ok=True)
        
        group_difference_plot(differences)
        group_swiss_and_ht_plot(merged.drop(columns=['seq_id']), diff_cols)
        group_box_plot(merged.drop(columns=['seq_id']), diff_cols)
        
def fasta_to_dataframe(fasta_file):
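    # NOTE: this assumes a strictly two-line-per-record FASTA (one '>' header
    # line followed by one unwrapped sequence line), matching the files
    # written by prepare_blast_inputs above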
    # Read the file into a DataFrame with a single column
    df = pd.read_fwf(fasta_file, header=None, colspecs=[(0, None)], names=['content'])

    # Select even and odd lines using pandas slicing
    ids = df.iloc[::2].reset_index(drop=True)  # Even-indexed lines (IDs)
    sequences = df.iloc[1::2].reset_index(drop=True)  # Odd-indexed lines (sequences)

    # Combine into a new DataFrame
    fasta_df = pd.DataFrame({'ID': ids['content'], 'Sequence': sequences['content']})
    fasta_df['ID'] = fasta_df['ID'].str.split('>',expand=True)[1]
    fasta_df['Sequence'] = fasta_df['Sequence'].str.strip().str.strip('\n')
    
    # print a preview of this 
    temp = fasta_df.head(10).copy()  # copy so the truncated preview doesn't modify fasta_df
    temp['Sequence'] = temp['Sequence'].apply(lambda x: x[0:10]+'...')
    log_update(f"Preview of head/tail fasta sequences in a dataframe:\n{temp.to_string(index=False)}")
    
    return fasta_df
    
def get_ht_uniprot_query(swissprot_top_alignments_df):
    '''
    Use swissprot_top_alignments_df to curate all the unique UniProt IDs (ID.Isoform) that created top head and tail alignments
    '''
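    # e.g. (illustrative) a top head hit of P02671 isoform 2 is recorded as
    # 'P02671.2'; entries that reduce to a bare '.' (no ID at all) are
    # filtered out below via len(x) > 1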
    swissprot_top_alignments_df['top_hg_full'] = swissprot_top_alignments_df['top_hg_UniProtID']+'.'+swissprot_top_alignments_df['top_hg_UniProt_isoform']
    swissprot_top_alignments_df['top_tg_full'] = swissprot_top_alignments_df['top_tg_UniProtID']+'.'+swissprot_top_alignments_df['top_tg_UniProt_isoform']
    
    unique_heads = swissprot_top_alignments_df.loc[
        swissprot_top_alignments_df['top_hg_UniProtID'].notna()
    ]['top_hg_full'].unique().tolist()

    unique_tails = swissprot_top_alignments_df.loc[
        swissprot_top_alignments_df['top_tg_UniProtID'].notna()
    ]['top_tg_full'].unique().tolist()
    
    unique_ht = set(unique_heads).union(set(unique_tails))
    unique_ht = list(unique_ht)
    unique_ht = [x for x in unique_ht if len(x)>1]    # not just "."

    with open("blast_outputs/ht_uniprot_query.txt", "w") as f:
        for i, ht in enumerate(unique_ht):
            if i!= len(unique_ht)-1:
                f.write(f"{ht}\n")
            else:
                f.write(f"{ht}")
                
def main():
    # Later, add argparse back in here and change where the log goes and what happens depending on what the user decides
    # May need to separate BLAST prep from the actual BLAST run for the manuscript, but worry about this later
    with open_logfile("fusion_blast_log.txt"):
        # Start by preparing BLAST inputs
        prepare_blast_inputs()
    
        # Then run BLAST
        run_blast("blast_inputs",database="swissprot")
        
        ###### Analyze BLAST results
        # Make database with head and tail info for each fusion, so we know what to expect
        fuson_ht_db = make_fuson_ht_db(savepath="fuson_ht_db.csv")
        
        #parse_all_blast_results(fuson_ht_db, database="swissprot")
        swissprot_blast_stats, swissprot_top_alignments_df = analyze_blast_results(fuson_ht_db,database="swissprot")

        swissprot_top_alignments_df = pd.read_csv("blast_outputs/swissprot_top_alignments.csv")
        get_ht_uniprot_query(swissprot_top_alignments_df)
        os.makedirs("figures/top_blast_visuals",exist_ok=True)
        group_pos_id_plot(swissprot_top_alignments_df)
        
if __name__ == '__main__':
    main()