import pandas as pd
import numpy as np
import os

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars
from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid, plot_all_train_val_test_values_hist_grid


def process_raw_albatross(df):
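    """Clean one raw ALBATROSS property dataframe (columns: ID, Sequence, Value, Split):
    parse IDs, validate sequences, report duplicates, and average values over duplicate sequences."""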
    # Summarize the Train/Test split of the raw data
    split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'})
    tot_prots = sum(split_str['count'])
    split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
    split_str = split_str.to_string(index=False)
    split_str = "\t\t" + split_str.replace("\n","\n\t\t")
    log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")

    # Parse the composite ID: keep the leading tokens as the ID and pull out the UniProt accession/name when present
    df['temp'] = df['ID'].str.split("_")
    df['ID'] = df['temp'].apply(lambda x: f"{x[0]}" if len(x)==1 else f"{x[0]}_{x[1]}" if len(x)<3 else f"{x[0]}_{x[1]}_{x[2]}")
    # Guard each lookup by the highest index it accesses so short IDs get NaN instead of an IndexError
    df['UniProt_ID'] = df['temp'].apply(lambda x: x[5].strip() if len(x)>=6 else np.nan)
    df['UniProt_Name'] = df['temp'].apply(lambda x: f"{x[8].strip()}_{x[9].strip()}" if len(x)>=10 else np.nan)
    df = df.drop(columns=['temp'])

    # Every column except Value should be a string free of list-like delimiters
    cols_to_check = list(df.columns)
    cols_to_check.remove('Value')
    assert df['Value'].dtype == 'float64'
    check_columns_for_listlike(df, cols_of_interest=cols_to_check, delimiters=DELIMITERS)

    # Flag sequences containing characters outside the valid amino-acid alphabet
    df['invalid_chars'] = df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    df[df['invalid_chars'].str.len()>0].sort_values(by='Sequence')
    all_invalid_chars = set().union(*df['invalid_chars'])
    log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}")
    assert (df['invalid_chars'].str.len()==0).all()
    df = df.drop(columns=['invalid_chars'])

    # Report duplicated sequences across the whole dataframe
    duplicates = df[df.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
    log_update(f"\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")

    # Duplicates within the Train split only
    duplicates = df.loc[df['Split']=='Train']
    duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df.loc[
        (df['Sequence'].isin(duplicates)) &
        (df['Split']=='Train')
    ])
    log_update(f"\t\twithin TRAIN only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Train rows")

    # Duplicates within the Test split only
    duplicates = df.loc[df['Split']=='Test']
    duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df.loc[
        (df['Sequence'].isin(duplicates)) &
        (df['Split']=='Test')
    ])
    log_update(f"\t\twithin TEST only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Test rows")

    # Sequences that appear in both the Train and Test splits
    duplicates_df = df.groupby('Sequence').agg({
        'Split': lambda x: ','.join(set(x))
    }).reset_index()
    duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
    duplicates = duplicates_df['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
    log_update(f"\t\tduplicates in BOTH TRAIN AND TEST: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
    log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")

    log_update("\tGrouping by sequence, averaging values, and keeping any Train/Test duplicates in the Test set...")
    df = df.replace(np.nan, '')
    df = df.groupby('Sequence').agg(
        Value=('Value', 'mean'),
        Value_STD=('Value', 'std'),
        IDs=('ID', lambda x: ','.join(x)),
        UniProt_IDs=('UniProt_ID', lambda x: ','.join(x)),
        UniProt_Names=('UniProt_Name', lambda x: ','.join(x)),
        Split=('Split', lambda x: ','.join(x))
    ).reset_index()
    for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
        # Drop empty entries left by blank IDs, re-join, and make sure no double commas remain
        df[col] = df[col].apply(lambda x: [y for y in x.split(',') if len(y)>0])
        df[col] = df[col].apply(lambda x: ','.join(x))
        df[col] = df[col].str.strip(',')
        assert len(df[df[col].str.contains(',,')])==0
    # If a sequence was ever in Test, keep it in Test
    df['Split'] = df['Split'].apply(lambda x: 'Test' if 'Test' in x else 'Train')

    # Check how much the averaged values varied across duplicate rows
    log_update("\tChecking coefficients of variation for averaged rows")
    df['Value_CV'] = 100*df['Value_STD']/df['Value']
    log_update(f"\t\tTotal rows with coefficient of variation (CV)\n\t\t\t<=10%: {len(df[df['Value_CV']<=10])}\n\t\t\t>10%: {len(df[df['Value_CV']>10])}\n\t\t\t>20%: {len(df[df['Value_CV']>20])}")

    # Confirm that grouping removed all duplicate sequences
    assert len(df[df['Sequence'].duplicated()])==0
    log_update(f"\tNo remaining duplicates: {len(df[df['Sequence'].duplicated()])==0}")

    # Summarize the Train/Test split after cleaning
    split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'})
    tot_prots = sum(split_str['count'])
    split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
    split_str = split_str.to_string(index=False)
    split_str = "\t\t" + split_str.replace("\n","\n\t\t")
    log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")

    return df


def combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp):
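    """Merge the four cleaned ALBATROSS property dataframes (asphericity, scaled Re, scaled Rg,
    scaling exponent) into one dataframe with one row per sequence and one column per property."""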
    log_update("\nCombining all four dataframes into one file of ALBATROSS sequences")

    # Keep the shared columns and rename each Value column after its property
    asph = asph[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'asph'})
    scaled_re = scaled_re[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_re'})
    scaled_rg = scaled_rg[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_rg'})
    scaling_exp = scaling_exp[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaling_exp'})

    # Outer-merge on Sequence so a sequence is kept even if it only has values for some properties
    combined = asph.merge(scaled_re, on='Sequence',how='outer',suffixes=('_asph', '_scaledre'))\
        .merge(scaled_rg, on='Sequence',how='outer',suffixes=('_scaledre', '_scaledrg'))\
        .merge(scaling_exp, on='Sequence',how='outer',suffixes=('_scaledrg', '_scalingexp')).fillna('')

    # Concatenate the per-property metadata columns into a single column each
    combined['IDs'] = combined['IDs_asph']+','+combined['IDs_scaledre']+','+combined['IDs_scaledrg']+','+combined['IDs_scalingexp']
    combined['UniProt_IDs'] = combined['UniProt_IDs_asph']+','+combined['UniProt_IDs_scaledre']+','+combined['UniProt_IDs_scaledrg']+','+combined['UniProt_IDs_scalingexp']
    combined['UniProt_Names'] = combined['UniProt_Names_asph']+','+combined['UniProt_Names_scaledre']+','+combined['UniProt_Names_scaledrg']+','+combined['UniProt_Names_scalingexp']
    combined['Split'] = combined['Split_asph']+','+combined['Split_scaledre']+','+combined['Split_scaledrg']+','+combined['Split_scalingexp']

    # Deduplicate the comma-joined metadata and drop empty entries
    for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
        combined[col] = combined[col].apply(lambda x: [y.strip() for y in x.split(',') if len(y)>0])
        combined[col] = combined[col].apply(lambda x: ','.join(set(x)))
        combined[col] = combined[col].str.strip(',')
        assert len(combined[combined[col].str.contains(',,')])==0
    combined = combined[['Sequence','IDs','UniProt_IDs','UniProt_Names','Split','asph','scaled_re','scaled_rg','scaling_exp']]
    combined = combined.replace('',np.nan)

    log_update("\tChecking for any cases where a protein is Train for one IDR prediction task and Test for another (should NOT happen!)")
    duplicates_df = combined.groupby('Sequence').agg({
        'Split': lambda x: ','.join(set(x))
    }).reset_index()
    duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
    duplicates = duplicates_df['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)])
    log_update(f"\t\tsequences in BOTH TRAIN AND TEST: {len(duplicates)} sequences, corresponding to {n_rows_with_duplicates} rows")
    if len(duplicates)>0:
        log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")

    # Drop duplicate rows and confirm each sequence now appears exactly once
    combined = combined.drop_duplicates().reset_index(drop=True)
    duplicates = combined[combined.duplicated('Sequence')]['Sequence'].unique().tolist()
    log_update(f"\tDropped duplicates.\n\tTotal duplicate sequences: {len(duplicates)}\n\tTotal sequences: {len(combined)}")
    assert len(duplicates)==0

    # Count sequences associated with more than one ID, UniProt ID, or UniProt name
    log_update("\tChecking how many sequences have multiple of the following: ID, UniProt ID, UniProt Name")
    for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
        n_multiple = len(combined.loc[(combined[col].notna()) & (combined[col].str.contains(','))])
        log_update(f"\t\t{col}: {n_multiple}")

    # Every input dataframe's sequences should still have a value for its property after the merge
    assert len(combined[combined['asph'].notna()])==len(asph)
    assert len(combined[combined['scaled_re'].notna()])==len(scaled_re)
    assert len(combined[combined['scaled_rg'].notna()])==len(scaled_rg)
    assert len(combined[combined['scaling_exp'].notna()])==len(scaling_exp)
    log_update("\tSequences with values for each property:")
    for prop in ['asph','scaled_re','scaled_rg','scaling_exp']:
        log_update(f"\t\t{prop}: {len(combined[combined[prop].notna()])}")

    log_update(f"\nPreview of combined database with columns: {combined.columns}\n{combined.head(10)}")
    return combined


def main():
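    """Load the raw ALBATROSS train/test files, clean each property, combine them, and write processed data and plots."""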
    with open_logfile("data_cleaning_log.txt"):
        # The raw ALBATROSS files have no header: column 0 = ID, 1 = Sequence, 2 = Value
        raw_data_folder = 'raw_data'
        dtype_dict = {0: str, 1: str, 2: float}
        rename_dict = {0: 'ID', 1: 'Sequence', 2: 'Value'}

        # Test split files (the asph and scaling_exp files are space-separated; the others are tab-separated)
        asph_test = pd.read_csv(f"{raw_data_folder}/asph_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_re_test = pd.read_csv(f"{raw_data_folder}/scaled_re_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_rg_test = pd.read_csv(f"{raw_data_folder}/scaled_rg_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaling_exp_test = pd.read_csv(f"{raw_data_folder}/scaling_exp_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)

        # Training split files
        asph_train = pd.read_csv(f"{raw_data_folder}/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_re_train = pd.read_csv(f"{raw_data_folder}/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_rg_train = pd.read_csv(f"{raw_data_folder}/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaling_exp_train = pd.read_csv(f"{raw_data_folder}/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)

        # Label each dataframe with its split before concatenating
        asph_test['Split'] = 'Test'
        scaled_re_test['Split'] = 'Test'
        scaled_rg_test['Split'] = 'Test'
        scaling_exp_test['Split'] = 'Test'

        asph_train['Split'] = 'Train'
        scaled_re_train['Split'] = 'Train'
        scaled_rg_train['Split'] = 'Train'
        scaling_exp_train['Split'] = 'Train'

        # Stack Test and Train for each property
        asph = pd.concat([asph_test, asph_train])
        scaled_re = pd.concat([scaled_re_test, scaled_re_train])
        scaled_rg = pd.concat([scaled_rg_test, scaled_rg_train])
        scaling_exp = pd.concat([scaling_exp_test, scaling_exp_train])

        log_update("Initial counts:")
        log_update(f"\tAsphericity: total entries={len(asph)}, not nan entries={len(asph.loc[asph['Value'].notna()])}")
        log_update(f"\tScaled re: total entries={len(scaled_re)}, not nan entries={len(scaled_re.loc[scaled_re['Value'].notna()])}")
        log_update(f"\tScaled rg: total entries={len(scaled_rg)}, not nan entries={len(scaled_rg.loc[scaled_rg['Value'].notna()])}")
        # Drop Rg values below 1 before cleaning
        scaled_rg = scaled_rg.loc[scaled_rg['Value']>=1].reset_index(drop=True)
        log_update(f"\t\tAfter dropping Rg values < 1: total entries={len(scaled_rg)}")
        log_update(f"\tScaling exp: total entries={len(scaling_exp)}, not nan entries={len(scaling_exp.loc[scaling_exp['Value'].notna()])}")

        # Clean each property dataframe with process_raw_albatross
        log_update(f"Example raw download: asphericity\n{asph.head()}")
        log_update("\nCleaning Asphericity")
        asph = process_raw_albatross(asph)
        log_update(f"\nProcessed data: asphericity\n{asph.head()}")

        log_update("\nCleaning Scaled Re")
        scaled_re = process_raw_albatross(scaled_re)
        log_update(f"\nProcessed data: scaled re\n{scaled_re.head()}")

        log_update("\nCleaning Scaled Rg")
        scaled_rg = process_raw_albatross(scaled_rg)
        log_update(f"\nProcessed data: scaled rg\n{scaled_rg.head()}")

        log_update("\nCleaning Scaling Exp")
        scaling_exp = process_raw_albatross(scaling_exp)
        log_update(f"\nProcessed data: scaling exp\n{scaling_exp.head()}")

        # Log value ranges for each property
        log_update("\nStats:")
        log_update(f"# Asphericity sequences: {len(asph)}\n\tRange: {min(asph['Value']):.4f}-{max(asph['Value']):.4f}")
        log_update(f"# Scaled Re sequences: {len(scaled_re)}\n\tRange: {min(scaled_re['Value']):.4f}-{max(scaled_re['Value']):.4f}")
        log_update(f"# Scaled Rg sequences: {len(scaled_rg)}\n\tRange: {min(scaled_rg['Value']):.4f}-{max(scaled_rg['Value']):.4f}")
        log_update(f"# Scaling Exponent sequences: {len(scaling_exp)}\n\tRange: {min(scaling_exp['Value']):.4f}-{max(scaling_exp['Value']):.4f}")

        # Combine the four cleaned dataframes and save the processed data
        combined = combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp)

        proc_folder = "processed_data"
        os.makedirs(proc_folder,exist_ok=True)
        combined.to_csv(f"{proc_folder}/all_albatross_seqs_and_properties.csv",index=False)

        # Plot value distributions: overall, and split into train/test
        values_dict = {
            'Asphericity': asph['Value'].tolist(),
            'End-to-End Distance (Re)': scaled_re['Value'].tolist(),
            'Radius of Gyration (Rg)': scaled_rg['Value'].tolist(),
            'Scaling Exponent': scaling_exp['Value'].tolist()
        }
        train_test_values_dict = {
            'Asphericity': {
                'train': asph[asph['Split']=='Train']['Value'].tolist(),
                'test': asph[asph['Split']=='Test']['Value'].tolist()},
            'End-to-End Distance (Re)': {
                'train': scaled_re[scaled_re['Split']=='Train']['Value'].tolist(),
                'test': scaled_re[scaled_re['Split']=='Test']['Value'].tolist()},
            'Radius of Gyration (Rg)': {
                'train': scaled_rg[scaled_rg['Split']=='Train']['Value'].tolist(),
                'test': scaled_rg[scaled_rg['Split']=='Test']['Value'].tolist()},
            'Scaling Exponent': {
                'train': scaling_exp[scaling_exp['Split']=='Train']['Value'].tolist(),
                'test': scaling_exp[scaling_exp['Split']=='Test']['Value'].tolist()},
        }
        plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png")
        plot_all_train_val_test_values_hist_grid(train_test_values_dict, save_path="processed_data/train_test_value_histograms.png")


if __name__ == "__main__":
    main()