root
data cleaning, blast, and splitting code with source data, also deleting unnecessary files
6efd653
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from fuson_plm.utils.visualizing import set_font | |
# Maps alignment-count column names to the short labels used on plot axes.
# (No `global` statement needed: a module-level assignment is already global.)
pos_id_label_dict = {
    'top_UniProt_nIdentities': 'Identities',
    'top_UniProt_nPositives': 'Positives',
}
def plot_pos_or_id_pcnt_hist(data, column_name, save_path=None, ax=None):
    """
    Plot a histogram of per-sequence percent coverage for one alignment column.

    Percent coverage is ``data[column_name] * 100 / data['aa_seq_len']``,
    computed for every row that has no missing value in 'seq_id',
    'aa_seq_len' or ``column_name``. A dashed vertical line marks the mean
    and a solid vertical line marks the median.

    Args:
        data: DataFrame containing 'seq_id', 'aa_seq_len' and ``column_name``
            columns. It is not modified.
        column_name: A key of ``pos_id_label_dict`` (e.g.
            'top_UniProt_nIdentities'); its mapped label is used on the x-axis.
        save_path: Optional output path for the figure. When given, the
            plotted values are also written next to it as a CSV whose name is
            ``save_path`` with its extension replaced by ``_source_data.csv``.
        ax: Optional matplotlib Axes to draw on. When omitted, a new 10x7
            figure is created.

    Raises:
        KeyError: If ``column_name`` is missing from ``data`` or from
            ``pos_id_label_dict``.
    """
    import os  # local import: only needed to derive the source-data file name

    set_font()

    # Record ownership up front: `ax` is rebound just below, so testing
    # `ax is None` later would never be true.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 7))
    fig = ax.get_figure()

    coverage_col = f"{column_name} Percent Coverage"

    # Only keep rows with alignments; copy so the caller's frame is untouched.
    plot_df = data[['seq_id', 'aa_seq_len', column_name]].dropna().copy()
    plot_df[coverage_col] = plot_df[column_name] * 100 / plot_df['aa_seq_len']  # so it's %
    coverage = plot_df[coverage_col]

    # Save the source data behind the figure (only possible with a save_path).
    if save_path is not None:
        # splitext (rather than replacing ".png") keeps the CSV from landing on
        # the same path as the image when another extension is used.
        stem, _ = os.path.splitext(save_path)
        source_data = plot_df[['seq_id', coverage_col]].sort_values(by=coverage_col, ascending=True)
        source_data[coverage_col] = source_data[coverage_col].round(3)
        source_data.to_csv(f"{stem}_source_data.csv", index=False)

    # Plot histogram for percent coverage
    ax.hist(coverage, bins=50, edgecolor='grey', alpha=0.8, color='mediumpurple')

    # Mean/median are NaN for empty input, so only annotate when there is data.
    if not coverage.empty:
        mean_coverage = coverage.mean()
        median_coverage = coverage.median()

        # Dashed line for the mean, solid line for the median
        ax.axvline(mean_coverage, color='black', linestyle='--', linewidth=2)
        ax.axvline(median_coverage, color='black', linestyle='-', linewidth=2)

        # Text labels placed at 90% / 80% of the current y-axis upper limit
        ax.text(mean_coverage, ax.get_ylim()[1] * 0.9, f'Mean: {mean_coverage:.1f}%', color='black',
                ha='center', va='top', fontsize=40, backgroundcolor='white')
        ax.text(median_coverage, ax.get_ylim()[1] * 0.8, f'Median: {median_coverage:.1f}%', color='black',
                ha='center', va='top', fontsize=40, backgroundcolor='white')

    # Style the axes we were given, not whatever pyplot considers "current".
    ax.tick_params(axis='both', labelsize=24)
    ax.set_xlabel(f"Max % {pos_id_label_dict[column_name]}", fontsize=40)
    ax.set_ylabel("Count", fontsize=40)
    fig.tight_layout()

    if save_path is not None:
        fig.savefig(save_path, dpi=300)
    elif created_figure:
        # Nothing was saved and the caller has no handle on the figure, so
        # displaying it is the only way to see the result. Calls that save to
        # disk keep their previous non-blocking behaviour.
        plt.show()
def group_pos_id_plot(data):
    """
    Draw the identities percent-coverage histogram for ``data``.

    Calls ``set_font()`` and then delegates to ``plot_pos_or_id_pcnt_hist``
    for the 'top_UniProt_nIdentities' column, saving the figure to
    figures/identities_hist.png on a newly created axes.
    """
    set_font()
    identities_column = 'top_UniProt_nIdentities'
    figure_path = "figures/identities_hist.png"
    plot_pos_or_id_pcnt_hist(data, identities_column, save_path=figure_path, ax=None)
def main():
    """Read the SwissProt top-alignments CSV and plot the identities histogram."""
    alignments_csv = "blast_outputs/swissprot_top_alignments.csv"
    swissprot_top_alignments_df = pd.read_csv(alignments_csv)
    plot_pos_or_id_pcnt_hist(
        swissprot_top_alignments_df,
        'top_UniProt_nIdentities',
        save_path="figures/identities_hist.png",
        ax=None,
    )


# Run the plotting pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()