FusOn-pLM / fuson_plm /data /blast /plot.py

root

data cleaning, blast, and splitting code with source data, also deleting unnecessary files

6efd653 6 months ago

3.23 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from fuson_plm.utils.visualizing import set_font

	# Maps the alignment count column names to the short labels used on plot axes.
	# (A `global` statement at module level has no effect, so none is needed here.)
	pos_id_label_dict = {
	    'top_UniProt_nIdentities': 'Identities',
	    'top_UniProt_nPositives': 'Positives',
	}

	def plot_pos_or_id_pcnt_hist(data, column_name, save_path=None, ax=None):
	    """
	    Plot a histogram of ``column_name`` expressed as a percentage of sequence length.

	    For every row without missing values, the plotted quantity is
	    ``100 * data[column_name] / data['aa_seq_len']``. A dashed vertical line marks
	    its mean and a solid vertical line marks its median.

	    Args:
	        data: DataFrame containing the columns 'seq_id', 'aa_seq_len' and
	            ``column_name``. The caller's frame is not modified.
	        column_name: Column to plot. Must be a key of ``pos_id_label_dict``
	            ('top_UniProt_nIdentities' or 'top_UniProt_nPositives'), because the
	            x-axis label is looked up there.
	        save_path: Optional path for the figure. When given, the figure is saved
	            there at 300 dpi and the plotted values (sorted ascending, rounded to
	            3 decimals) are written next to it as ``<path without extension>_source_data.csv``.
	            When None, nothing is written to disk.
	        ax: Optional matplotlib Axes to draw on. When None, a new 10x7 inch figure
	            is created.

	    Returns:
	        None.
	    """
	    import os  # function-scope import: only needed to derive the source-data path

	    set_font()

	    # `ax` is rebound below, so record now whether this call owns the figure.
	    owns_figure = ax is None
	    if owns_figure:
	        _, ax = plt.subplots(figsize=(10, 7))
	    # Work on the figure that actually holds `ax`, not pyplot's "current" figure,
	    # so a caller-supplied Axes is styled and saved correctly.
	    fig = ax.figure

	    coverage_col = f"{column_name} Percent Coverage"

	    # Only keep those with alignments; copy so the new column never touches the input.
	    data = data[['seq_id', 'aa_seq_len', column_name]].dropna().copy()
	    data[coverage_col] = data[column_name] * 100 / data['aa_seq_len']  # so it's %
	    coverage = data[coverage_col]

	    # Save the plotted values alongside the figure.
	    if save_path is not None:
	        save_path = os.fspath(save_path)
	        # splitext (instead of str.replace(".png", ...)) keeps the CSV path distinct
	        # from the figure path for any extension, so the figure cannot overwrite it.
	        stem, _ = os.path.splitext(save_path)
	        source_data = data[['seq_id', coverage_col]].sort_values(by=coverage_col, ascending=True)
	        source_data[coverage_col] = source_data[coverage_col].round(3)
	        source_data.to_csv(f"{stem}_source_data.csv", index=False)

	    # Plot histogram for percent coverage
	    ax.hist(coverage, bins=50, edgecolor='grey', alpha=0.8, color='mediumpurple')

	    # Mean/median are NaN for an empty frame; skip the annotations in that case.
	    if not coverage.empty:
	        mean_coverage = coverage.mean()
	        median_coverage = coverage.median()

	        # Dashed line for the mean, solid line for the median
	        ax.axvline(mean_coverage, color='black', linestyle='--', linewidth=2)
	        ax.axvline(median_coverage, color='black', linestyle='-', linewidth=2)

	        # Text labels placed at 90% / 80% of the y-axis upper limit
	        y_top = ax.get_ylim()[1]
	        ax.text(mean_coverage, y_top * 0.9, f'Mean: {mean_coverage:.1f}%', color='black',
	                ha='center', va='top', fontsize=40, backgroundcolor='white')
	        ax.text(median_coverage, y_top * 0.8, f'Median: {median_coverage:.1f}%', color='black',
	                ha='center', va='top', fontsize=40, backgroundcolor='white')

	    # Labels
	    ax.tick_params(axis='both', labelsize=24)
	    ax.set_xlabel(f"Max % {pos_id_label_dict[column_name]}", fontsize=40)
	    ax.set_ylabel("Count", fontsize=40)

	    fig.tight_layout()

	    # Save the plot
	    if save_path is not None:
	        fig.savefig(save_path, dpi=300)

	    # Show the plot only when this call created the figure and did not save it.
	    # The previous `if ax is None` check could never be true at this point, so
	    # calls that pass save_path keep their non-interactive behaviour.
	    if owns_figure and save_path is None:
	        plt.show()

	def group_pos_id_plot(data):
	    """Call ``set_font()`` and draw the 'top_UniProt_nIdentities' histogram for ``data``.

	    The figure is written to ``figures/identities_hist.png`` (relative to the
	    current working directory).
	    """
	    set_font()

	    identities_png = "figures/identities_hist.png"
	    plot_pos_or_id_pcnt_hist(
	        data,
	        'top_UniProt_nIdentities',
	        save_path=identities_png,
	        ax=None,
	    )

	def main():
	    """Read ``blast_outputs/swissprot_top_alignments.csv`` and save the identities histogram.

	    Both the input CSV path and the output figure path are relative to the
	    current working directory.
	    """
	    alignments_csv = "blast_outputs/swissprot_top_alignments.csv"
	    top_alignments = pd.read_csv(alignments_csv)

	    plot_pos_or_id_pcnt_hist(
	        top_alignments,
	        'top_UniProt_nIdentities',
	        save_path="figures/identities_hist.png",
	        ax=None,
	    )

	# Script entry point: run main() only when this file is executed directly,
	# not when it is imported.
	if __name__ == "__main__":
	    main()