import pandas as pd
import numpy as np
import os
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars
from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid, plot_all_train_val_test_values_hist_grid
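# Pipeline overview: each raw ALBATROSS property dataset (asphericity, scaled Re,
# scaled Rg, scaling exponent) is cleaned and deduplicated by process_raw_albatross(),
# the four cleaned frames are merged on Sequence by combine_albatross_seqs(), and the
# result is saved to processed_data/ and plotted.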
def process_raw_albatross(df):
# Return a version of the df with the first column split into ID fields, duplicates resolved, and columns checked for list-like values and invalid characters
# first, look at the splits
split_str = df['Split'].value_counts().rename_axis('Split').reset_index(name='count')
tot_prots = sum(split_str['count'])
split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
split_str = split_str.to_string(index=False)
split_str = "\t\t" + split_str.replace("\n","\n\t\t")
log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
# format: IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE
# or: synth_test_sequence0
df['temp'] = df['ID'].str.split("_")
df['ID'] = df['temp'].apply(lambda x: f"{x[0]}" if len(x)==1 else f"{x[0]}_{x[1]}" if len(x)<3 else f"{x[0]}_{x[1]}_{x[2]}")
# Not every ID carries a UniProt ID and name (e.g. synthetic sequences), so allow np.nan when this info is missing.
df['UniProt_ID'] = df['temp'].apply(lambda x: x[5].strip() if len(x)>=6 else np.nan)
df['UniProt_Name'] = df['temp'].apply(lambda x: f"{x[8].strip()}_{x[9].strip()}" if len(x)>=10 else np.nan)
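# Worked example of the parse above (derived from the ID formats noted earlier):
# "IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE".split("_") gives
# ['IDR','19076','tr','','','A0A8M9PNM5','','','A0A8M9PNM5','DANRE'],
# so ID -> "IDR_19076_tr", UniProt_ID -> element 5, UniProt_Name -> elements 8 and 9.
# "synth_test_sequence0" splits into only 3 parts, so both UniProt fields stay NaN.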
df = df.drop(columns=['temp'])
cols_to_check = list(df.columns)
cols_to_check.remove('Value') # don't check this one because it shouldn't be string
# Investigate the columns we just created and make sure they don't contain any list-like values or invalid characters.
# make sure value is float type
assert df['Value'].dtype == 'float64'
check_columns_for_listlike(df, cols_of_interest=cols_to_check, delimiters=DELIMITERS)
# Check for invalid AAs
df['invalid_chars'] = df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
invalid_rows = df[df['invalid_chars'].str.len()>0].sort_values(by='Sequence')
all_invalid_chars = set().union(*df['invalid_chars'])
log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered in the dataframe: {all_invalid_chars}\n\t\trows with invalid characters: {len(invalid_rows)}")
# Assert no invalid AAs
assert (df['invalid_chars'].str.len()==0).all()
df = df.drop(columns=['invalid_chars'])
# Check for duplicates - duplicated sequences are later collapsed to one row (values averaged) and assigned to Test if they ever appear in Test
duplicates = df[df.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
log_update(f"\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
# Look for distribution of duplicates WITHIN train, WITHIN test, and BETWEEN train and test
# Train only
duplicates = df.loc[
(df['Split']=='Train')
]
duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df.loc[
(df['Sequence'].isin(duplicates)) &
(df['Split']=='Train')
])
log_update(f"\t\twithin TRAIN only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Train rows")
# Test only
duplicates = df.loc[
(df['Split']=='Test')
]
duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df.loc[
(df['Sequence'].isin(duplicates)) &
(df['Split']=='Test')
])
log_update(f"\t\twithin TEST only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Test rows")
# Between train and test
duplicates_df = df.groupby('Sequence').agg({
'Split': lambda x: ','.join(set(x))
}).reset_index()
duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
duplicates = duplicates_df['Sequence'].unique().tolist()
n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
log_update(f"\t\tduplicates in BOTH TRAIN AND TEST: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
log_update("\tGrouping by sequence, averaging values, and keeping any Train/Test duplicates in the Test set...")
df = df.replace(np.nan, '')
df = df.groupby('Sequence').agg(
Value=('Value', 'mean'),
Value_STD=('Value', 'std'),
IDs=('ID', lambda x: ','.join(x)),
UniProt_IDs=('UniProt_ID', lambda x: ','.join(x)),
UniProt_Names=('UniProt_Name', lambda x: ','.join(x)),
Split=('Split', lambda x: ','.join(x))
).reset_index()
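# Illustrative effect of the aggregation (hypothetical numbers): two rows for the same
# sequence with Values 0.40 and 0.44 collapse into one row with Value 0.42,
# Value_STD ~0.028, and comma-joined IDs / UniProt fields / Splits.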
for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
# drop empty tokens left over from missing values, then rejoin
df[col] = df[col].apply(lambda x: ','.join([y for y in x.split(',') if len(y)>0]))
df[col] = df[col].str.strip(',')
# make sure there are no stray commas left
assert len(df[df[col].str.contains(',,')])==0
# set Split to Test if test is in it
df['Split'] = df['Split'].apply(lambda x: 'Test' if 'Test' in x else 'Train')
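# Consequence of the rule above: a sequence that ever appeared in the Test split stays
# in Test after merging, so no shared sequence leaks back into the training set.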
# For anything that wasn't duplicated, Value_STD is nan
log_update("\tChecking coefficients of variation for averaged rows")
# calculate coefficient of variation, should be < 10
df['Value_CV'] = 100*df['Value_STD']/df['Value']
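# CV expresses the std of the averaged duplicates as a percent of their mean; e.g.
# (hypothetical numbers) std 0.02 on a mean of 0.45 gives a CV of ~4.4%, well under
# the 10% threshold reported below.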
log_update(f"\t\tTotal rows with coefficient of variation (CV)\n\t\t\t<=10%: {len(df[df['Value_CV']<=10])}\n\t\t\t>10%: {len(df[df['Value_CV']>10])}\n\t\t\t>20%: {len(df[df['Value_CV']>20])}")
# Ensure there are no duplicates
assert len(df[df['Sequence'].duplicated()])==0
log_update(f"\tNo remaining duplicates: {len(df[df['Sequence'].duplicated()])==0}")
# Print the final distribution of train and test values
split_str = df['Split'].value_counts().rename_axis('Split').reset_index(name='count')
tot_prots = sum(split_str['count'])
split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
split_str = split_str.to_string(index=False)
split_str = "\t\t" + split_str.replace("\n","\n\t\t")
log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
return df
def combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp):
log_update("\nCombining all four dataframes into one file of ALBATROSS sequences")
asph = asph[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'asph'})
scaled_re = scaled_re[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_re'})
scaled_rg = scaled_rg[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_rg'})
scaling_exp = scaling_exp[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaling_exp'})
combined = asph.merge(scaled_re, on='Sequence',how='outer',suffixes=('_asph', '_scaledre'))\
.merge(scaled_rg, on='Sequence',how='outer',suffixes=('_scaledre', '_scaledrg'))\
.merge(scaling_exp, on='Sequence',how='outer',suffixes=('_scaledrg', '_scalingexp')).fillna('')
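# Note on the chained merges: pandas applies suffixes only to overlapping column names,
# so after the three outer merges the per-dataset metadata columns come out as
# IDs_asph / IDs_scaledre / IDs_scaledrg / IDs_scalingexp (and likewise for the UniProt
# and Split columns), which the concatenations below rely on.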
# Combine the per-dataset metadata columns; Train/Test consistency across datasets is verified below
combined['IDs'] = combined['IDs_asph']+','+combined['IDs_scaledre']+','+combined['IDs_scaledrg']+','+combined['IDs_scalingexp']
combined['UniProt_IDs'] = combined['UniProt_IDs_asph']+','+combined['UniProt_IDs_scaledre']+','+combined['UniProt_IDs_scaledrg']+','+combined['UniProt_IDs_scalingexp']
combined['UniProt_Names'] = combined['UniProt_Names_asph']+','+combined['UniProt_Names_scaledre']+','+combined['UniProt_Names_scaledrg']+','+combined['UniProt_Names_scalingexp']
combined['Split'] = combined['Split_asph']+','+combined['Split_scaledre']+','+combined['Split_scaledrg']+','+combined['Split_scalingexp']
# Make the lists clean
for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
combined[col] = combined[col].apply(lambda x: [y.strip() for y in x.split(',') if len(y)>0])
combined[col] = combined[col].apply(lambda x: ','.join(set(x)))
combined[col] = combined[col].str.strip(',')
# make sure there are no commas left
assert len(combined[combined[col].str.contains(',,')])==0
combined = combined[['Sequence','IDs','UniProt_IDs','UniProt_Names','Split','asph','scaled_re','scaled_rg','scaling_exp']] # drop unneeded merge relics
combined = combined.replace('',np.nan)
# Make sure there are no sequences where split is both train and test
log_update("\tChecking for any cases where a protein is Train for one IDR prediction task and Test for another (should NOT happen!)")
duplicates_df = combined.groupby('Sequence').agg({
'Split': lambda x: ','.join(set(x))
}).reset_index()
duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
duplicates = duplicates_df['Sequence'].unique().tolist()
n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)])
log_update(f"\t\tsequences in BOTH TRAIN AND TEST: {len(duplicates)} sequences, corresponding to {n_rows_with_duplicates} rows")
if len(duplicates)>0:
log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
# Now, get rid of duplicates
combined = combined.drop_duplicates().reset_index(drop=True)
duplicates = combined[combined.duplicated('Sequence')]['Sequence'].unique().tolist()
log_update(f"\tDropped duplicates.\n\tTotal duplicate sequences: {len(duplicates)}\n\tTotal sequences: {len(combined)}")
assert len(duplicates)==0
# See how many sequences have multiple entries in each metadata column
log_update("\tChecking how many sequences have multiple of the following: ID, UniProt ID, UniProt Name, Split")
for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
n_multiple = len(combined.loc[(combined[col].notna()) & (combined[col].str.contains(','))])
log_update(f"\t\t{col}: {n_multiple}")
# See how many entries there are for each property (should match the length of the original dataset)
assert len(combined[combined['asph'].notna()])==len(asph)
assert len(combined[combined['scaled_re'].notna()])==len(scaled_re)
assert len(combined[combined['scaled_rg'].notna()])==len(scaled_rg)
assert len(combined[combined['scaling_exp'].notna()])==len(scaling_exp)
log_update("\tSequences with values for each property:")
for prop in ['asph','scaled_re','scaled_rg','scaling_exp']:
log_update(f"\t\t{prop}: {len(combined[combined[prop].notna()])}")
log_update(f"\nPreview of combined database with columns: {combined.columns}\n{combined.head(10)}")
return combined
def main():
with open_logfile("data_cleaning_log.txt"):
# Read in all of the raw data
raw_data_folder = 'raw_data'
dtype_dict = {0:str,1:str,2:float}
rename_dict = {0:'ID',1:'Sequence',2:'Value'}
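# Assumed raw file layout: each file is header-less with three columns (space- or
# tab-separated, per the sep arguments below), e.g. an illustrative row:
# IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE  MKV...  0.43
# which the dtype/rename dicts map to string ID, string Sequence, and float Value.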
# Read in the test data
asph_test = pd.read_csv(f"{raw_data_folder}/asph_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_re_test = pd.read_csv(f"{raw_data_folder}/scaled_re_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_rg_test = pd.read_csv(f"{raw_data_folder}/scaled_rg_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaling_exp_test = pd.read_csv(f"{raw_data_folder}/scaling_exp_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
# Read in the train data
asph_train = pd.read_csv(f"{raw_data_folder}/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_re_train = pd.read_csv(f"{raw_data_folder}/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaled_rg_train = pd.read_csv(f"{raw_data_folder}/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
scaling_exp_train = pd.read_csv(f"{raw_data_folder}/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
# Concatenate - include columns for split
asph_test['Split'] = ['Test']*len(asph_test)
scaled_re_test['Split'] = ['Test']*len(scaled_re_test)
scaled_rg_test['Split'] = ['Test']*len(scaled_rg_test)
scaling_exp_test['Split'] = ['Test']*len(scaling_exp_test)
asph_train['Split'] = ['Train']*len(asph_train)
scaled_re_train['Split'] = ['Train']*len(scaled_re_train)
scaled_rg_train['Split'] = ['Train']*len(scaled_rg_train)
scaling_exp_train['Split'] = ['Train']*len(scaling_exp_train)
asph = pd.concat([asph_test, asph_train])
scaled_re = pd.concat([scaled_re_test, scaled_re_train])
scaled_rg = pd.concat([scaled_rg_test, scaled_rg_train])
scaling_exp = pd.concat([scaling_exp_test, scaling_exp_train])
log_update("Initial counts:")
log_update(f"\tAsphericity: total entries={len(asph)}, not nan entries={len(asph.loc[asph['Value'].notna()])}")
log_update(f"\tScaled re: total entries={len(scaled_re)}, not nan entries={len(scaled_re.loc[scaled_re['Value'].notna()])}")
log_update(f"\tScaled rg: total entries={len(scaled_rg)}, not nan entries={len(scaled_rg.loc[scaled_rg['Value'].notna()])}")
# drop any scaled_rg rows with values less than 1, as done in the paper
scaled_rg = scaled_rg.loc[
scaled_rg['Value']>=1].reset_index(drop=True)
log_update(f"\t\tAfter dropping Rg values < 1: total entries={len(scaled_rg)}")
log_update(f"\tScaling exp: total entries={len(scaling_exp)}, not nan entries={len(scaling_exp.loc[scaling_exp['Value'].notna()])}")
# Process the raw data
log_update(f"Example raw download: asphericity\n{asph.head()}")
log_update(f"\nCleaning Asphericity")
asph = process_raw_albatross(asph)
log_update(f"\nProcessed data: asphericity\n{asph.head()}")
log_update(f"\nCleaning Scaled Re")
scaled_re = process_raw_albatross(scaled_re)
log_update(f"\nProcessed data: scaled re\n{scaled_re.head()}")
log_update(f"\nCleaning Scaled Rg")
scaled_rg = process_raw_albatross(scaled_rg)
log_update(f"\nProcessed data: scaled rg\n{scaled_rg.head()}")
log_update(f"\nCleaning Scaling Exp")
scaling_exp = process_raw_albatross(scaling_exp)
log_update(f"\nProcessed data: scaling exp\n{scaling_exp.head()}")
# Give some stats about each dataset
log_update("\nStats:")
log_update(f"# Asphericity sequences: {len(asph)}\n\tRange: {min(asph['Value']):.4f}-{max(asph['Value']):.4f}")
log_update(f"# Scaled Re sequences: {len(scaled_re)}\n\tRange: {min(scaled_re['Value']):.4f}-{max(scaled_re['Value']):.4f}")
log_update(f"# Scaled Rg sequences: {len(scaled_rg)}\n\tRange: {min(scaled_rg['Value']):.4f}-{max(scaled_rg['Value']):.4f}")
log_update(f"# Scaling Exponent sequences: {len(scaling_exp)}\n\tRange: {min(scaling_exp['Value']):.4f}-{max(scaling_exp['Value']):.4f}")
# Combine
combined = combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp)
# Save processed data
proc_folder = "processed_data"
os.makedirs(proc_folder,exist_ok=True)
combined.to_csv(f"{proc_folder}/all_albatross_seqs_and_properties.csv",index=False)
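# The saved CSV has one row per unique sequence with columns Sequence, IDs, UniProt_IDs,
# UniProt_Names, Split, asph, scaled_re, scaled_rg, scaling_exp; a property column is
# NaN when that sequence was absent from the corresponding dataset.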
# Plot the data distribution and save it
values_dict = {
'Asphericity': asph['Value'].tolist(),
'End-to-End Distance (Re)': scaled_re['Value'].tolist(),
'Radius of Gyration (Rg)': scaled_rg['Value'].tolist(),
'Scaling Exponent': scaling_exp['Value'].tolist()
}
train_test_values_dict = {
'Asphericity': {
'train': asph[asph['Split']=='Train']['Value'].tolist(),
'test': asph[asph['Split']=='Test']['Value'].tolist()},
'End-to-End Distance (Re)': {
'train': scaled_re[scaled_re['Split']=='Train']['Value'].tolist(),
'test': scaled_re[scaled_re['Split']=='Test']['Value'].tolist()},
'Radius of Gyration (Rg)': {
'train': scaled_rg[scaled_rg['Split']=='Train']['Value'].tolist(),
'test': scaled_rg[scaled_rg['Split']=='Test']['Value'].tolist()},
'Scaling Exponent': {
'train': scaling_exp[scaling_exp['Split']=='Train']['Value'].tolist(),
'test': scaling_exp[scaling_exp['Split']=='Test']['Value'].tolist()},
}
plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png")
plot_all_train_val_test_values_hist_grid(train_test_values_dict, save_path="processed_data/train_test_value_histograms.png")
if __name__ == "__main__":
main()