# File size: 3,226 Bytes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fuson_plm.utils.visualizing import set_font

# Maps the alignment columns that can be plotted to the short label shown on
# the x-axis. A module-level name is already global, so no `global` statement
# is needed here.
pos_id_label_dict = {
    'top_UniProt_nIdentities': 'Identities',
    'top_UniProt_nPositives': 'Positives',
}

def plot_pos_or_id_pcnt_hist(data, column_name, save_path=None, ax=None):
    """
    Plot a histogram of ``100 * data[column_name] / data['aa_seq_len']``.

    Rows with a missing value in 'seq_id', 'aa_seq_len' or ``column_name`` are
    dropped first. The mean and median of the resulting values are drawn as a
    dashed and a solid vertical line, each with a text label.

    Args:
        data: DataFrame with the columns 'seq_id', 'aa_seq_len' and
            ``column_name``. It is not modified.
        column_name: Column to plot, e.g. 'top_UniProt_nIdentities' or
            'top_UniProt_nPositives'. A name that is not a key of
            ``pos_id_label_dict`` is used verbatim in the x-axis label.
        save_path: Optional path for the figure. When given, the figure is
            saved there at 300 dpi and the plotted values are written next to
            it as "<save_path without extension>_source_data.csv". When None,
            nothing is written to disk.
        ax: Optional matplotlib Axes to draw on. When None, a new 10x7 figure
            is created.

    Returns:
        None.
    """
    import os  # local import: only used to derive the source-data CSV path

    set_font()

    # Remember whether the figure is ours *before* rebinding `ax`; checking
    # `ax is None` after creating the axes can never be true.
    owns_figure = ax is None
    if owns_figure:
        fig, ax = plt.subplots(figsize=(10, 7))
    else:
        fig = ax.figure

    coverage_col = f"{column_name} Percent Coverage"

    # Only keep rows with alignments; copy so the caller's frame is untouched
    # and the column assignment below never hits a view.
    data = data[['seq_id', 'aa_seq_len', column_name]].dropna().copy()
    data[coverage_col] = data[column_name] * 100 / data['aa_seq_len']  # so it's %

    # Save the plotted values alongside the figure (only if a path was given).
    if save_path is not None:
        # splitext works for any image extension; a plain ".png" -> ".csv"
        # text replace would leave e.g. "plot.pdf" unchanged and the CSV would
        # then be overwritten by the figure itself.
        path_root, _ = os.path.splitext(save_path)
        source_data_save_path = f"{path_root}_source_data.csv"
        source_data = data[['seq_id', coverage_col]].sort_values(by=coverage_col, ascending=True)
        source_data[coverage_col] = source_data[coverage_col].round(3)
        source_data.to_csv(source_data_save_path, index=False)

    # Calculate the mean and median of the percent coverage
    mean_coverage = data[coverage_col].mean()
    median_coverage = data[coverage_col].median()

    # Plot histogram for percent coverage
    ax.hist(data[coverage_col], bins=50, edgecolor='grey', alpha=0.8, color='mediumpurple')

    # Dashed line = mean, solid line = median
    ax.axvline(mean_coverage, color='black', linestyle='--', linewidth=2)
    ax.axvline(median_coverage, color='black', linestyle='-', linewidth=2)

    # Text labels sit at 90% / 80% of the current y-limit so they do not overlap
    ax.text(mean_coverage, ax.get_ylim()[1] * 0.9, f'Mean: {mean_coverage:.1f}%', color='black',
            ha='center', va='top', fontsize=40, backgroundcolor='white')
    ax.text(median_coverage, ax.get_ylim()[1] * 0.8, f'Median: {median_coverage:.1f}%', color='black',
            ha='center', va='top', fontsize=40, backgroundcolor='white')

    # Labels; style the given axes rather than whatever pyplot considers current
    ax.tick_params(axis='both', labelsize=24)
    axis_label = pos_id_label_dict.get(column_name, column_name)
    ax.set_xlabel(f"Max % {axis_label}", fontsize=40)
    ax.set_ylabel("Count", fontsize=40)

    fig.tight_layout()

    if save_path is not None:
        # Save the figure that owns `ax`, which may not be pyplot's current one
        fig.savefig(save_path, dpi=300)
    elif owns_figure:
        # Nothing was saved and the caller has no handle on the figure, so
        # showing it is the only way to see the result.
        plt.show()
        
def group_pos_id_plot(data):
    """
    Plot the 'top_UniProt_nIdentities' histogram for ``data``.

    The figure is saved to "figures/identities_hist.png" by
    ``plot_pos_or_id_pcnt_hist``, which creates its own axes.
    """
    set_font()

    identities_column = 'top_UniProt_nIdentities'
    figure_path = "figures/identities_hist.png"
    plot_pos_or_id_pcnt_hist(
        data,
        identities_column,
        save_path=figure_path,
        ax=None,
    )
    
def main():
    """
    Read "blast_outputs/swissprot_top_alignments.csv" and plot the
    'top_UniProt_nIdentities' histogram to "figures/identities_hist.png".
    """
    alignments_csv = "blast_outputs/swissprot_top_alignments.csv"
    top_alignments = pd.read_csv(alignments_csv)

    plot_pos_or_id_pcnt_hist(
        top_alignments,
        'top_UniProt_nIdentities',
        save_path="figures/identities_hist.png",
        ax=None,
    )


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()