File size: 17,711 Bytes

e048d40

import pandas as pd
import numpy as np 
import os
from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.utils.constants import DELIMITERS, VALID_AAS
from fuson_plm.utils.data_cleaning import check_columns_for_listlike, find_invalid_chars
from fuson_plm.benchmarking.idr_prediction.plot import plot_all_values_hist_grid, plot_all_train_val_test_values_hist_grid

def process_raw_albatross(df):
    # return a version of the df with first column split, duplicates cleaned,columns checked for weird characters and invalids
    
    # first, look at the splits
    split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'})
    tot_prots = sum(split_str['count'])
    split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
    split_str = split_str.to_string(index=False)
    split_str = "\t\t" + split_str.replace("\n","\n\t\t")
    log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")
    
    # format: IDR_19076_tr___A0A8M9PNM5___A0A8M9PNM5_DANRE
    # or: synth_test_sequence0
    df['temp'] = df['ID'].str.split("_")
    df['ID'] = df['temp'].apply(lambda x: f"{x[0]}" if len(x)==1 else f"{x[0]}_{x[1]}" if len(x)<3 else f"{x[0]}_{x[1]}_{x[2]}")
    # Not ever column has UniProt IDs and Names, so we have to allow np.nan if this info is missing.
    df['UniProt_ID'] = df['temp'].apply(lambda x: x[5].strip() if len(x)>=5 else np.nan)
    df['UniProt_Name'] = df['temp'].apply(lambda x: f"{x[8].strip()}_{x[9].strip()}" if len(x)>=8 else np.nan)
    df = df.drop(columns=['temp'])
    
    cols_to_check = list(df.columns)
    cols_to_check.remove('Value')   # don't check this one because it shouldn't be string
    # Investigate the colimns we just created and make sure they don't have any invalid features. 
    # make sure value is float type
    assert df['Value'].dtype == 'float64'
    check_columns_for_listlike(df, cols_of_interest=cols_to_check, delimiters=DELIMITERS)
    
    # Check for invalid AAs
    df['invalid_chars'] = df['Sequence'].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    df[df['invalid_chars'].str.len()>0].sort_values(by='Sequence')
    all_invalid_chars = set().union(*df['invalid_chars'])
    log_update(f"\tchecking for invalid characters...\n\t\tset of all invalid characters discovered within train_df: {all_invalid_chars}")

    # Assert no invalid AAs 
    assert (df['invalid_chars'].str.len()==0).all()
    df = df.drop(columns=['invalid_chars'])
    
    # Check for duplicates - if we find any, REMOVE them from train and keep them in test
    duplicates = df[df.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
    log_update(f"\t{len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
    
    # Look for distribution of duplicates WITHIN train, WITHIN test, and BETWEEN train and test
    # Train only
    duplicates = df.loc[
            (df['Split']=='Train')
        ]
    duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df.loc[
            (df['Sequence'].isin(duplicates)) &
            (df['Split']=='Train')
        ])
    log_update(f"\t\twithin TRAIN only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Train rows")
    
    # Test only
    duplicates = df.loc[
            (df['Split']=='Test')
    ]
    duplicates = duplicates[duplicates.duplicated('Sequence')]['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df.loc[
            (df['Sequence'].isin(duplicates)) &
            (df['Split']=='Test')
        ])
    log_update(f"\t\twithin TEST only: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} Test rows")
    
    # Between train and test
    duplicates_df = df.groupby('Sequence').agg({
        'Split': lambda x: ','.join(set(x))
    }).reset_index()
    duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
    duplicates = duplicates_df['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(df[df['Sequence'].isin(duplicates)])
    log_update(f"\t\tduplicates in BOTH TRAIN AND TEST: {len(duplicates)} duplicated sequences, corresponding to {n_rows_with_duplicates} rows")
    log_update(f"\t\tprinting portion of dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
    
    log_update("\tGrouping by sequence, averaging values, and keeping any Train/Test duplicates in the Test set...")
    df = df.replace(np.nan, '')
    df = df.groupby('Sequence').agg(
        Value=('Value', 'mean'),
        Value_STD=('Value', 'std'),
        IDs=('ID', lambda x: ','.join(x)),
        UniProt_IDs=('UniProt_ID', lambda x: ','.join(x)),
        UniProt_Names=('UniProt_Name', lambda x: ','.join(x)),
        Split=('Split', lambda x: ','.join(x))
    ).reset_index()
    for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
        df[col] = df[col].apply(lambda x: [y for y in x.split(',')])
        df[col] = df[col].apply(lambda x: ','.join(x))
        df[col] = df[col].str.strip(',')
        # make sure there are no commas left 
        assert len(df[df[col].str.contains(',,')])==0
    # set Split to Test if test is in it
    df['Split'] = df['Split'].apply(lambda x: 'Test' if 'Test' in x else 'Train')    
        
    # For anything that wasn't duplicated, Value_STD is nan
    log_update("\tChecking coefficients of variation for averaged rows")
    # calculate coefficient of variation, should be < 10
    df['Value_CV'] = 100*df['Value_STD']/df['Value']
    log_update(f"\t\tTotal rows with coefficient of variation (CV)\n\t\t\t<=10%: {len(df[df['Value_CV']<=10])}\n\t\t\t>10%: {len(df[df['Value_CV']>10])}\n\t\t\t>20%: {len(df[df['Value_CV']>20])}")
    
    # Ensure there are no duplicates
    assert len(df[df['Sequence'].duplicated()])==0
    log_update(f"\tNo remaining duplicates: {len(df[df['Sequence'].duplicated()])==0}")
    
    # Print the final distribution of train and test values
    split_str = df['Split'].value_counts().reset_index().rename(columns={'index': 'Split','Split': 'count'})
    tot_prots = sum(split_str['count'])
    split_str['pcnt'] = round(100*split_str['count']/tot_prots,2)
    split_str = split_str.to_string(index=False)
    split_str = "\t\t" + split_str.replace("\n","\n\t\t")
    log_update(f"\tTotal proteins: {tot_prots}\n\tSplits:\n{split_str}")

    return df

def combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp):
    log_update("\nCombining all four dataframes into one file of ALBATROSS sequences")
    
    asph = asph[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'asph'})
    scaled_re = scaled_re[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_re'})
    scaled_rg = scaled_rg[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaled_rg'})
    scaling_exp = scaling_exp[['Sequence','Value','IDs','UniProt_IDs','UniProt_Names','Split']].rename(columns={'Value':'scaling_exp'})

    combined = asph.merge(scaled_re, on='Sequence',how='outer',suffixes=('_asph', '_scaledre'))\
        .merge(scaled_rg, on='Sequence',how='outer',suffixes=('_scaledre', '_scaledrg'))\
        .merge(scaling_exp, on='Sequence',how='outer',suffixes=('_scaledrg', '_scalingexp')).fillna('')
        
    # Make sure something that's in train for one is in train for all, and not test
    combined['IDs'] = combined['IDs_asph']+','+combined['IDs_scaledre']+','+combined['IDs_scaledrg']+','+combined['IDs_scalingexp']
    combined['UniProt_IDs'] = combined['UniProt_IDs_asph']+','+combined['UniProt_IDs_scaledre']+','+combined['UniProt_IDs_scaledrg']+','+combined['UniProt_IDs_scalingexp']
    combined['UniProt_Names'] = combined['UniProt_Names_asph']+','+combined['UniProt_Names_scaledre']+','+combined['UniProt_Names_scaledrg']+','+combined['UniProt_Names_scalingexp']
    combined['Split'] = combined['Split_asph']+','+combined['Split_scaledre']+','+combined['Split_scaledrg']+','+combined['Split_scalingexp']
    
    # Make the lists clean
    for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
        combined[col] = combined[col].apply(lambda x: [y.strip() for y in x.split(',') if len(y)>0])
        combined[col] = combined[col].apply(lambda x: ','.join(set(x)))
        combined[col] = combined[col].str.strip(',')
        # make sure there are no commas left 
        assert len(combined[combined[col].str.contains(',,')])==0
    combined = combined[['Sequence','IDs','UniProt_IDs','UniProt_Names','Split','asph','scaled_re','scaled_rg','scaling_exp']]   # drop unneeded merge relics
    combined = combined.replace('',np.nan)
    # Make sure there are no sequences where split is both train and test
    log_update("\tChecking for any cases where a protein is Train for one IDR prediction task and Test for another (should NOT happen!)")
    duplicates_df = combined.groupby('Sequence').agg({
        'Split': lambda x: ','.join(set(x))
    }).reset_index()
    duplicates_df = duplicates_df.loc[duplicates_df['Split'].str.contains(',')].reset_index(drop=True)
    duplicates = duplicates_df['Sequence'].unique().tolist()
    n_rows_with_duplicates = len(combined[combined['Sequence'].isin(duplicates)])
    log_update(f"\t\tsequences in BOTH TRAIN AND TEST: {len(duplicates)} sequences, corresponding to {n_rows_with_duplicates} rows")
    if len(duplicates)>0:
        log_update(f"\t\tprinting portion of assert len(combined[combined['asph'].notna()])==len(asph)dataframe with train+test shared seqs:\n{duplicates_df.head(5)}")
    
    # Now, get rid of duplicates
    combined = combined.drop_duplicates().reset_index(drop=True)
    duplicates = combined[combined.duplicated('Sequence')]['Sequence'].unique().tolist()
    log_update(f"\tDropped duplicates.\n\tTotal duplicate sequences: {len(duplicates)}\n\tTotal sequences: {len(combined)}")
    assert len(duplicates)==0
    
    # See how many columns have multiple entries for each
    log_update(f"\tChecking how many sequences have multiple of the following: ID, UniProt ID, UniProt Name")
    for col in ['IDs','UniProt_IDs','UniProt_Names','Split']:
        n_multiple = len(combined.loc[(combined[col].notna()) & (combined[col].str.contains(','))])
        log_update(f"\t\t{col}: {n_multiple}")
    
    # See how many entries there are of each cproperty (should match length of original database)
    assert len(combined[combined['asph'].notna()])==len(asph)
    assert len(combined[combined['scaled_re'].notna()])==len(scaled_re)
    assert len(combined[combined['scaled_rg'].notna()])==len(scaled_rg)
    assert len(combined[combined['scaling_exp'].notna()])==len(scaling_exp)
    log_update("\tSequences with values for each property:")
    for property in ['asph','scaled_re','scaled_rg','scaling_exp']:
        log_update(f"\t\t{property}: {len(combined[combined[property].notna()])}")
    
    log_update(f"\nPreview of combined database with columns: {combined.columns}\n{combined.head(10)}")
    return combined

def main():
    with open_logfile("data_cleaning_log.txt"):
        # Read in all of the raw data
        raw_data_folder = 'raw_data'
        dtype_dict = {0:str,1:str,2:float}
        rename_dict = {0:'ID',1:'Sequence',2:'Value'}
        
        # Read in the test data
        asph_test = pd.read_csv(f"{raw_data_folder}/asph_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_re_test = pd.read_csv(f"{raw_data_folder}/scaled_re_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_rg_test = pd.read_csv(f"{raw_data_folder}/scaled_rg_nat_meth_test.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaling_exp_test = pd.read_csv(f"{raw_data_folder}/scaling_exp_nat_meth_test.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        
        # Read in the train data
        asph_train = pd.read_csv(f"{raw_data_folder}/asph_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_re_train = pd.read_csv(f"{raw_data_folder}/scaled_re_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaled_rg_train = pd.read_csv(f"{raw_data_folder}/scaled_rg_bio_synth_training_data_cleaned_05_09_2023.tsv",sep="\t",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        scaling_exp_train = pd.read_csv(f"{raw_data_folder}/scaling_exp_bio_synth_training_data_cleaned_05_09_2023.tsv",sep=" ",dtype=dtype_dict,header=None).rename(columns=rename_dict)
        
        # Concatenate - include columns for split
        asph_test['Split'] = ['Test']*len(asph_test)
        scaled_re_test['Split'] = ['Test']*len(scaled_re_test)
        scaled_rg_test['Split'] = ['Test']*len(scaled_rg_test)
        scaling_exp_test['Split'] = ['Test']*len(scaling_exp_test)

        asph_train['Split'] = ['Train']*len(asph_train)
        scaled_re_train['Split'] = ['Train']*len(scaled_re_train)
        scaled_rg_train['Split'] = ['Train']*len(scaled_rg_train)
        scaling_exp_train['Split'] = ['Train']*len(scaling_exp_train)
        
        asph = pd.concat([asph_test, asph_train])
        scaled_re = pd.concat([scaled_re_test, scaled_re_train])
        scaled_rg = pd.concat([scaled_rg_test, scaled_rg_train])
        scaling_exp = pd.concat([scaling_exp_test, scaling_exp_train])
        
        log_update("Initial counts:")
        log_update(f"\tAsphericity: total entries={len(asph)}, not nan entries={len(asph.loc[asph['Value'].notna()])}")
        log_update(f"\tScaled re: total entries={len(scaled_re)}, not nan entries={len(scaled_re.loc[scaled_re['Value'].notna()])}")
        log_update(f"\tScaled rg: total entries={len(scaled_rg)}, not nan entries={len(scaled_rg.loc[scaled_rg['Value'].notna()])}")
        # change any scaled_rg rows with values less than 1 to np.nan, as done in the paper
        scaled_rg = scaled_rg.loc[
            scaled_rg['Value']>=1].reset_index(drop=True)
        log_update(f"\t\tAfter dropping Rg values < 1: total entries={len(scaled_rg)}")
        log_update(f"\tScaling exp: total entries={len(scaling_exp)}, not nan entries={len(scaling_exp.loc[scaling_exp['Value'].notna()])}")
    
        # Process the raw data
        log_update(f"Example raw download: asphericity\n{asph.head()}")
        log_update(f"\nCleaning Asphericity")
        asph = process_raw_albatross(asph)
        log_update(f"\nProcessed data: asphericity\n{asph.head()}")
        
        log_update(f"\nCleaning Scaled Re")
        scaled_re = process_raw_albatross(scaled_re)
        log_update(f"\nProcessed data: scaled re\n{scaled_re.head()}")
        
        log_update(f"\nCleaning Scaled Rg")
        scaled_rg = process_raw_albatross(scaled_rg)
        log_update(f"\nProcessed data: scaled rg\n{scaled_rg.head()}")

        log_update(f"\nCleaning Scaling Exp")
        scaling_exp = process_raw_albatross(scaling_exp)
        log_update(f"\nProcessed data: scaling exp\n{scaling_exp.head()}")
        
        # Give some stats about each dataset
        log_update("\nStats:")
        log_update(f"# Asphericity sequences: {len(asph)}\n\tRange: {min(asph['Value']):.4f}-{max(asph['Value']):.4f}")
        log_update(f"# Scaled Re sequences: {len(scaled_re)}\n\tRange: {min(scaled_re['Value']):.4f}-{max(scaled_re['Value']):.4f}")
        log_update(f"# Scaled Rg sequences: {len(scaled_rg)}\n\tRange: {min(scaled_rg['Value']):.4f}-{max(scaled_rg['Value']):.4f}")
        log_update(f"# Scaling Exponent sequences: {len(scaling_exp)}\n\tRange: {min(scaling_exp['Value']):.4f}-{max(scaling_exp['Value']):.4f}")
        
        # Combine
        combined = combine_albatross_seqs(asph, scaled_re, scaled_rg, scaling_exp)
        
        # Save processed data
        proc_folder = "processed_data"
        os.makedirs(proc_folder,exist_ok=True)
        combined.to_csv(f"{proc_folder}/all_albatross_seqs_and_properties.csv",index=False)
        
        # Plot the data distribution and save it
        values_dict =  {
            'Asphericity': asph['Value'].tolist(),
            'End-to-End Distance (Re)': scaled_re['Value'].tolist(),
            'Radius of Gyration (Rg)': scaled_rg['Value'].tolist(),
            'Scaling Exponent': scaling_exp['Value'].tolist()
        }
        train_test_values_dict = {
            'Asphericity': {
                'train': asph[asph['Split']=='Train']['Value'].tolist(),
                'test': asph[asph['Split']=='Test']['Value'].tolist()},
            'End-to-End Distance (Re)': {
                'train': scaled_re[scaled_re['Split']=='Train']['Value'].tolist(),
                'test': scaled_re[scaled_re['Split']=='Test']['Value'].tolist()},
            'Radius of Gyration (Rg)': {
                'train': scaled_rg[scaled_rg['Split']=='Train']['Value'].tolist(),
                'test': scaled_rg[scaled_rg['Split']=='Test']['Value'].tolist()},
            'Scaling Exponent': {
                'train': scaling_exp[scaling_exp['Split']=='Train']['Value'].tolist(),
                'test': scaling_exp[scaling_exp['Split']=='Test']['Value'].tolist()},  
        }
        plot_all_values_hist_grid(values_dict, save_path="processed_data/value_histograms.png")
        plot_all_train_val_test_values_hist_grid(train_test_values_dict, save_path="processed_data/train_test_value_histograms.png")

if __name__ == "__main__":
    main()