import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from fuson_plm.utils.visualizing import set_font

# Maps alignment-count column names to the short labels used on plot axes.
pos_id_label_dict = {
    'top_UniProt_nIdentities': 'Identities',
    'top_UniProt_nPositives': 'Positives',
}


def plot_pos_or_id_pcnt_hist(data, column_name, save_path=None, ax=None):
    """Plot a histogram of ``column_name`` as a percentage of sequence length.

    For every row with no missing values in ``seq_id``, ``aa_seq_len`` and
    ``column_name``, the value ``column_name * 100 / aa_seq_len`` is computed
    and histogrammed, with the mean (dashed line) and median (solid line)
    marked and labelled.

    Args:
        data: DataFrame containing the columns ``seq_id``, ``aa_seq_len`` and
            ``column_name``. It is not modified.
        column_name: Column holding the per-sequence count to plot. Keys of
            ``pos_id_label_dict`` get a short x-axis label; any other column
            name is used as the label verbatim.
        save_path: Optional path for the figure. When given, the plotted
            values are also written next to it as
            ``<save_path without extension>_source_data.csv``.
        ax: Optional matplotlib Axes to draw on. When omitted, a new 10x7
            figure is created and shown once plotting (and saving) is done.
    """
    set_font()

    # Remember whether the figure is ours: `ax` is rebound below, so testing
    # `ax is None` later on could never be true.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 7))
    fig = ax.figure

    coverage_col = f"{column_name} Percent Coverage"

    # Only keep rows that have an alignment; copy so that neither the caller's
    # frame nor a view of it is ever written to.
    plot_data = data[['seq_id', 'aa_seq_len', column_name]].dropna().copy()
    plot_data[coverage_col] = plot_data[column_name] * 100 / plot_data['aa_seq_len']

    # Save the values behind the figure, but only when an output location was
    # actually requested (save_path defaults to None).
    if save_path is not None:
        # splitext keeps the CSV path distinct from the image path whatever
        # the image extension is.
        path_root, _ = os.path.splitext(save_path)
        source_data = plot_data[['seq_id', coverage_col]].sort_values(
            by=coverage_col, ascending=True
        )
        source_data[coverage_col] = source_data[coverage_col].round(3)
        source_data.to_csv(f"{path_root}_source_data.csv", index=False)

    mean_coverage = plot_data[coverage_col].mean()
    median_coverage = plot_data[coverage_col].median()

    ax.hist(plot_data[coverage_col], bins=50, edgecolor='grey', alpha=0.8,
            color='mediumpurple')

    # Dashed line marks the mean, solid line marks the median.
    ax.axvline(mean_coverage, color='black', linestyle='--', linewidth=2)
    ax.axvline(median_coverage, color='black', linestyle='-', linewidth=2)

    # Labels sit at 90% / 80% of the y-axis height so they do not overlap
    # each other when mean and median are close together.
    y_top = ax.get_ylim()[1]
    ax.text(mean_coverage, y_top * 0.9, f'Mean: {mean_coverage:.1f}%',
            color='black', ha='center', va='top', fontsize=40,
            backgroundcolor='white')
    ax.text(median_coverage, y_top * 0.8, f'Median: {median_coverage:.1f}%',
            color='black', ha='center', va='top', fontsize=40,
            backgroundcolor='white')

    # Style the axes we were given rather than pyplot's "current" axes, which
    # may be a different subplot when the caller passes `ax`.
    ax.tick_params(axis='both', labelsize=24)
    axis_label = pos_id_label_dict.get(column_name, column_name)
    ax.set_xlabel(f"Max % {axis_label}", fontsize=40)
    ax.set_ylabel("Count", fontsize=40)

    fig.tight_layout()

    if save_path is not None:
        fig.savefig(save_path, dpi=300)

    # Show the plot only when this function created the figure itself.
    if created_figure:
        plt.show()


def group_pos_id_plot(data):
    """Plot the identities histogram for ``data``.

    Writes the figure to ``figures/identities_hist.png``. Only the
    ``top_UniProt_nIdentities`` column is plotted.
    """
    set_font()
    plot_pos_or_id_pcnt_hist(data, 'top_UniProt_nIdentities',
                             save_path="figures/identities_hist.png", ax=None)


def main():
    """Read the top-alignments CSV and plot the identities histogram."""
    swissprot_top_alignments_df = pd.read_csv(
        "blast_outputs/swissprot_top_alignments.csv"
    )
    plot_pos_or_id_pcnt_hist(swissprot_top_alignments_df,
                             'top_UniProt_nIdentities',
                             save_path="figures/identities_hist.png", ax=None)


if __name__ == '__main__':
    main()