import pandas as pd
import numpy as np
import os
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars
from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid, plot_all_train_val_test_values_hist_grid
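# Pipeline overview: each raw ALBATROSS property dataset (asphericity, scaled Re,
# scaled Rg, scaling exponent) is cleaned and deduplicated by process_raw_albatross(),
# the four cleaned frames are merged on Sequence by combine_albatross_seqs(), and the
# result is saved to processed_data/ and plotted.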
def process_raw_albatross(df):
# Return a version of the df with the first column split into ID fields, duplicates resolved, and columns checked for list-like values and invalid characters
# first, look at the splits
split_str = df['Split'].value_counts().rename_axis('Split').reset_index(name='count')
tot_prots = sum(split_str['count'])
split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
split_str = split_str.to_string(index=False)
split_str = "\t\t" + split_str.replace("\n","\n\t\t")
log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
# format: IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE
# or: synth_test_sequence0
df['temp'] = df['ID'].str.split("_")
df['ID'] = df['temp'].apply(lambda x: f"{x[0]}" if len(x)==1 else f"{x[0]}_{x[1]}" if len(x)<3 else f"{x[0]}_{x[1]}_{x[2]}")
# Not every ID carries a UniProt ID and name (e.g. synthetic sequences), so allow np.nan when this info is missing.
df['UniProt_ID'] = df['temp'].apply(lambda x: x[5].strip() if len(x)>=6 else np.nan)
df['UniProt_Name'] = df['temp'].apply(lambda x: f"{x[8].strip()}_{x[9].strip()}" if len(x)>=10 else np.nan)
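# Worked example of the parse above (derived from the ID formats noted earlier):
# "IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE".split("_") gives
# ['IDR','19076','tr','','','A0A8M9PNM5','','','A0A8M9PNM5','DANRE'],
# so ID -> "IDR_19076_tr", UniProt_ID -> element 5, UniProt_Name -> elements 8 and 9.
# "synth_test_sequence0" splits into only 3 parts, so both UniProt fields stay NaN.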
df = df.drop(columns=['temp'])
cols_to_check = list(df.columns)
cols_to_check.remove('Value') # don't check this one because it shouldn't be string
# Investigate the columns we just created and make sure they don't contain any list-like values or invalid characters.
# make sure value is float type
assert df['Value'].dtype == 'float64'
check_columns_for_listlike(df, cols_of_interest=cols_to_check, delimiters=DELIMITERS)
# Check for invalid AAs
df['invalid_chars'] = df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
invalid_rows = df[df['invalid_chars'].str.len()>0].sort_values(by='Sequence')
all_invalid_chars = set().union(*df['invalid_chars'])
log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered in the dataframe: {all_invalid_chars}\n\t\trows with invalid characters: {len(invalid_rows)}")
# Assert no invalid AAs
assert (df['invalid_chars'].str.len()==0).all()
df = df.drop(columns=['invalid_chars'])
# Check for duplicates - duplicated sequences are later collapsed to one row (values averaged) and assigned to Test if they ever appear in Test
duplicates = df[df.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
log_update(f"\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
# Look for distribution of duplicates WITHIN train, WITHIN test, and BETWEEN train and test
# Train only
duplicates = df.loc[
(df['Split']=='Train')
]
duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df.loc[
(df['Sequence'].isin(duplicates)) &
(df['Split']=='Train')
])
log_update(f"\t\twithin TRAIN only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Train rows")
# Test only
duplicates = df.loc[
(df['Split']=='Test')
]
duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df.loc[
(df['Sequence'].isin(duplicates)) &
(df['Split']=='Test')
])
log_update(f"\t\twithin TEST only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Test rows")
# Between train and test
duplicates_df = df.groupby('Sequence').agg({
'Split': lambda x: ','.join(set(x))
}).reset_index()
duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
duplicates = duplicates_df['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
log_update(f"\t\tduplicates in BOTH TRAIN AND TEST: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
log_update("\tGrouping by sequence, averaging values, and keeping any Train/Test duplicates in the Test set...")
df = df.replace(np.nan, '')
df = df.groupby('Sequence').agg(
Value=('Value', 'mean'),
Value_STD=('Value', 'std'),
IDs=('ID', lambda x: ','.join(x)),
UniProt_IDs=('UniProt_ID', lambda x: ','.join(x)),
UniProt_Names=('UniProt_Name', lambda x: ','.join(x)),
Split=('Split', lambda x: ','.join(x))
).reset_index()
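# Illustrative effect of the aggregation (hypothetical numbers): two rows for the same
# sequence with Values 0.40 and 0.44 collapse into one row with Value 0.42,
# Value_STD ~0.028, and comma-joined IDs / UniProt fields / Splits.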
for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
# drop empty tokens left over from missing values, then rejoin
df[col] = df[col].apply(lambda x: ','.join([y for y in x.split(',') if len(y)>0]))
df[col] = df[col].str.strip(',')
# make sure there are no stray commas left
assert len(df[df[col].str.contains(',,')])==0
# set Split to Test if test is in it
df['Split'] = df['Split'].apply(lambda x: 'Test' if 'Test' in x else 'Train')
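# Consequence of the rule above: a sequence that ever appeared in the Test split stays
# in Test after merging, so no shared sequence leaks back into the training set.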
# For anything that wasn't duplicated, Value_STD is nan
log_update("\tChecking coefficients of variation for averaged rows")
# calculate coefficient of variation, should be < 10
df['Value_CV'] = 100*df['Value_STD']/df['Value']
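# CV expresses the std of the averaged duplicates as a percent of their mean; e.g.
# (hypothetical numbers) std 0.02 on a mean of 0.45 gives a CV of ~4.4%, well under
# the 10% threshold reported below.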
log_update(f"\t\tTotal rows with coefficient of variation (CV)\n\t\t\t<=10%: {len(df[df['Value_CV']<=10])}\n\t\t\t>10%: {len(df[df['Value_CV']>10])}\n\t\t\t>20%: {len(df[df['Value_CV']>20])}")
# Ensure there are no duplicates
assert len(df[df['Sequence'].duplicated()])==0
log_update(f"\tNo remaining duplicates: {len(df[df['Sequence'].duplicated()])==0}")
# Print the final distribution of train and test values
split_str = df['Split'].value_counts().rename_axis('Split').reset_index(name='count')
tot_prots = sum(split_str['count'])
split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
split_str = split_str.to_string(index=False)
split_str = "\t\t" + split_str.replace("\n","\n\t\t")
log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
return df
def combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp):
log_update("\nCombining all four dataframes into one file of ALBATROSS sequences")
asph = asph[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'asph'})
scaled_re = scaled_re[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_re'})
scaled_rg = scaled_rg[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_rg'})
scaling_exp = scaling_exp[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaling_exp'})
combined = asph.merge(scaled_re, on='Sequence',how='outer',suffixes=('_asph', '_scaledre'))\
.merge(scaled_rg, on='Sequence',how='outer',suffixes=('_scaledre', '_scaledrg'))\
.merge(scaling_exp, on='Sequence',how='outer',suffixes=('_scaledrg', '_scalingexp')).fillna('')
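# Note on the chained merges: pandas applies suffixes only to overlapping column names,
# so after the three outer merges the per-dataset metadata columns come out as
# IDs_asph / IDs_scaledre / IDs_scaledrg / IDs_scalingexp (and likewise for the UniProt
# and Split columns), which the concatenations below rely on.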
# Combine the per-dataset metadata columns; Train/Test consistency across datasets is verified below
combined['IDs'] = combined['IDs_asph']+','+combined['IDs_scaledre']+','+combined['IDs_scaledrg']+','+combined['IDs_scalingexp']
combined['UniProt_IDs'] = combined['UniProt_IDs_asph']+','+combined['UniProt_IDs_scaledre']+','+combined['UniProt_IDs_scaledrg']+','+combined['UniProt_IDs_scalingexp']
combined['UniProt_Names'] = combined['UniProt_Names_asph']+','+combined['UniProt_Names_scaledre']+','+combined['UniProt_Names_scaledrg']+','+combined['UniProt_Names_scalingexp']
combined['Split'] = combined['Split_asph']+','+combined['Split_scaledre']+','+combined['Split_scaledrg']+','+combined['Split_scalingexp']
# Make the lists clean
for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
combined[col] = combined[col].apply(lambda x: [y.strip() for y in x.split(',') if len(y)>0])
combined[col] = combined[col].apply(lambda x: ','.join(set(x)))
combined[col] = combined[col].str.strip(',')
# make sure there are no commas left
assert len(combined[combined[col].str.contains(',,')])==0
combined = combined[['Sequence','IDs','UniProt_IDs','UniProt_Names','Split','asph','scaled_re','scaled_rg','scaling_exp']] # drop unneeded merge relics
combined = combined.replace('',np.nan)
# Make sure there are no sequences where split is both train and test
log_update("\tChecking for any cases where a protein is Train for one IDR prediction task and Test for another (should NOT happen!)")
duplicates_df = combined.groupby('Sequence').agg({
'Split': lambda x: ','.join(set(x))
}).reset_index()
duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
duplicates = duplicates_df['Sequence'].unique().tolist()
n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)])
log_update(f"\t\tsequences in BOTH TRAIN AND TEST: {len(duplicates)} sequences, corresponding to {n_rows_with_duplicates} rows")
if len(duplicates)>0:
log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
# Now, get rid of duplicates
combined = combined.drop_duplicates().reset_index(drop=True)
duplicates = combined[combined.duplicated('Sequence')]['Sequence'].unique().tolist()
log_update(f"\tDropped duplicates.\n\tTotal duplicate sequences: {len(duplicates)}\n\tTotal sequences: {len(combined)}")
assert len(duplicates)==0
# See how many sequences have multiple entries in each metadata column
log_update("\tChecking how many sequences have multiple of the following: ID, UniProt ID, UniProt Name, Split")
for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
n_multiple = len(combined.loc[(combined[col].notna()) & (combined[col].str.contains(','))])
log_update(f"\t\t{col}: {n_multiple}")
# See how many entries there are for each property (should match the length of the original dataset)
assert len(combined[combined['asph'].notna()])==len(asph)
assert len(combined[combined['scaled_re'].notna()])==len(scaled_re)
assert len(combined[combined['scaled_rg'].notna()])==len(scaled_rg)
assert len(combined[combined['scaling_exp'].notna()])==len(scaling_exp)
log_update("\tSequences with values for each property:")
for prop in ['asph','scaled_re','scaled_rg','scaling_exp']:
log_update(f"\t\t{prop}: {len(combined[combined[prop].notna()])}")
log_update(f"\nPreview of combined database with columns: {combined.columns}\n{combined.head(10)}")
return combined
def main():
with open_logfile("data_cleaning_log.txt"):
# Read in all of the raw data
raw_data_folder = 'raw_data'
dtype_dict = {0:str,1:str,2:float}
rename_dict = {0:'ID',1:'Sequence',2:'Value'}
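# Assumed raw file layout: each file is header-less with three columns (space- or
# tab-separated, per the sep arguments below), e.g. an illustrative row:
# IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE  MKV...  0.43
# which the dtype/rename dicts map to string ID, string Sequence, and float Value.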
# Read in the test data
asph_test = pd.read_csv(f"{raw_data_folder}/asph_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_re_test = pd.read_csv(f"{raw_data_folder}/scaled_re_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_rg_test = pd.read_csv(f"{raw_data_folder}/scaled_rg_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaling_exp_test = pd.read_csv(f"{raw_data_folder}/scaling_exp_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
# Read in the train data
asph_train = pd.read_csv(f"{raw_data_folder}/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_re_train = pd.read_csv(f"{raw_data_folder}/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_rg_train = pd.read_csv(f"{raw_data_folder}/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaling_exp_train = pd.read_csv(f"{raw_data_folder}/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
# Concatenate - include columns for split
asph_test['Split'] = ['Test']*len(asph_test)
scaled_re_test['Split'] = ['Test']*len(scaled_re_test)
scaled_rg_test['Split'] = ['Test']*len(scaled_rg_test)
scaling_exp_test['Split'] = ['Test']*len(scaling_exp_test)
asph_train['Split'] = ['Train']*len(asph_train)
scaled_re_train['Split'] = ['Train']*len(scaled_re_train)
scaled_rg_train['Split'] = ['Train']*len(scaled_rg_train)
scaling_exp_train['Split'] = ['Train']*len(scaling_exp_train)
asph = pd.concat([asph_test, asph_train])
scaled_re = pd.concat([scaled_re_test, scaled_re_train])
scaled_rg = pd.concat([scaled_rg_test, scaled_rg_train])
scaling_exp = pd.concat([scaling_exp_test, scaling_exp_train])
log_update("Initial counts:")
log_update(f"\tAsphericity: total entries={len(asph)}, not nan entries={len(asph.loc[asph['Value'].notna()])}")
log_update(f"\tScaled re: total entries={len(scaled_re)}, not nan entries={len(scaled_re.loc[scaled_re['Value'].notna()])}")
log_update(f"\tScaled rg: total entries={len(scaled_rg)}, not nan entries={len(scaled_rg.loc[scaled_rg['Value'].notna()])}")
# drop any scaled_rg rows with values less than 1, as done in the paper
scaled_rg = scaled_rg.loc[
scaled_rg['Value']>=1].reset_index(drop=True)
log_update(f"\t\tAfter dropping Rg values < 1: total entries={len(scaled_rg)}")
log_update(f"\tScaling exp: total entries={len(scaling_exp)}, not nan entries={len(scaling_exp.loc[scaling_exp['Value'].notna()])}")
# Process the raw data
log_update(f"Example raw download: asphericity\n{asph.head()}")
log_update(f"\nCleaning Asphericity")
asph = process_raw_albatross(asph)
log_update(f"\nProcessed data: asphericity\n{asph.head()}")
log_update(f"\nCleaning Scaled Re")
scaled_re = process_raw_albatross(scaled_re)
log_update(f"\nProcessed data: scaled re\n{scaled_re.head()}")
log_update(f"\nCleaning Scaled Rg")
scaled_rg = process_raw_albatross(scaled_rg)
log_update(f"\nProcessed data: scaled rg\n{scaled_rg.head()}")
log_update(f"\nCleaning Scaling Exp")
scaling_exp = process_raw_albatross(scaling_exp)
log_update(f"\nProcessed data: scaling exp\n{scaling_exp.head()}")
# Give some stats about each dataset
log_update("\nStats:")
log_update(f"# Asphericity sequences: {len(asph)}\n\tRange: {min(asph['Value']):.4f}-{max(asph['Value']):.4f}")
log_update(f"# Scaled Re sequences: {len(scaled_re)}\n\tRange: {min(scaled_re['Value']):.4f}-{max(scaled_re['Value']):.4f}")
log_update(f"# Scaled Rg sequences: {len(scaled_rg)}\n\tRange: {min(scaled_rg['Value']):.4f}-{max(scaled_rg['Value']):.4f}")
log_update(f"# Scaling Exponent sequences: {len(scaling_exp)}\n\tRange: {min(scaling_exp['Value']):.4f}-{max(scaling_exp['Value']):.4f}")
# Combine
combined = combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp)
# Save processed data
proc_folder = "processed_data"
os.makedirs(proc_folder,exist_ok=True)
combined.to_csv(f"{proc_folder}/all_albatross_seqs_and_properties.csv",index=False)
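# The saved CSV has one row per unique sequence with columns Sequence, IDs, UniProt_IDs,
# UniProt_Names, Split, asph, scaled_re, scaled_rg, scaling_exp; a property column is
# NaN when that sequence was absent from the corresponding dataset.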
# Plot the data distribution and save it
values_dict = {
'Asphericity': asph['Value'].tolist(),
'End-to-End Distance (Re)': scaled_re['Value'].tolist(),
'Radius of Gyration (Rg)': scaled_rg['Value'].tolist(),
'Scaling Exponent': scaling_exp['Value'].tolist()
}
train_test_values_dict = {
'Asphericity': {
'train': asph[asph['Split']=='Train']['Value'].tolist(),
'test': asph[asph['Split']=='Test']['Value'].tolist()},
'End-to-End Distance (Re)': {
'train': scaled_re[scaled_re['Split']=='Train']['Value'].tolist(),
'test': scaled_re[scaled_re['Split']=='Test']['Value'].tolist()},
'Radius of Gyration (Rg)': {
'train': scaled_rg[scaled_rg['Split']=='Train']['Value'].tolist(),
'test': scaled_rg[scaled_rg['Split']=='Test']['Value'].tolist()},
'Scaling Exponent': {
'train': scaling_exp[scaling_exp['Split']=='Train']['Value'].tolist(),
'test': scaling_exp[scaling_exp['Split']=='Test']['Value'].tolist()},
}
plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png")
plot_all_train_val_test_values_hist_grid(train_test_values_dict, save_path="processed_data/train_test_value_histograms.png")
if __name__ == "__main__":
main()