# Spaces: ml-jku/mhnfs | Running | File size: 8,445 Bytes | 9afbc33

import json
import os
import re

import pandas as pd
import swifter
import tqdm
from rdkit import Chem

# Disable RDKit informational and warning messages
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Root directories of the local data / code checkouts.
# NOTE(review): the original right-hand sides were stripped from this file
# (only the trailing comments survived), which made the module unparseable.
# The roots are read from environment variables here so that the module
# imports cleanly; the variable names are a placeholder convention -- adjust
# them (or hard-code the paths) to match the local setup.
_PUBCHEM_ROOT = os.environ.get('PUBCHEM_ROOT', '')
_FSMOL_ROOT = os.environ.get('FSMOL_ROOT', '')
_CHEMBL_ROOT = os.environ.get('CHEMBL_ROOT', '')
_MHNFS_ROOT = os.environ.get('MHNFS_ROOT', '')

PUBCHEM_DIR = os.path.join(_PUBCHEM_ROOT, 'pubchem24/')  # pubchem_path + 'pubchem24/'
FSMOL_UID_PATH = os.path.join(_FSMOL_ROOT, 'fsmol', 'fsmol_train_accession_keys.json')  # fsmol_path + '/fsmol/fsmol_train_accession_keys.json'
PROT_CLASS_PATH = os.path.join(_CHEMBL_ROOT, 'chembl33', 'uniprot_pclass_mapping.csv')  # chembl_path + 'chembl33/uniprot_pclass_mapping.csv'
MHNFS_PATH = os.path.join(_MHNFS_ROOT, 'mhnfs')  # mhnfs_path +  '/mhnfs'

import sys
sys.path.append(MHNFS_PATH)
from src.data_preprocessing.utils import Standardizer

class PubChemFilter:
    """Step-wise filter that turns raw PubChem dumps into one activity table.

    The methods are meant to be called in sequence (see the ``__main__``
    section of this file). Intermediate results live on the instance:

    * ``self.df_assays`` -- assays (``AID``, ``UID``) kept after assay filtering
    * ``self.df_bio``    -- bioactivities (``AID``, ``CID``, ``Activity``)
    * ``self.df``        -- merged table that all later steps filter in place
    """

    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug = False):
        """
        Store the input locations.

        Args:
            pubchem_dir: directory containing ``bioassays.tsv.gz``,
                ``bioactivities.tsv.gz`` and ``smiles.tsv.gz``
            fsmol_uid_path: JSON file with the FS-Mol training accession keys
            prot_class_path: CSV file with the protein classification mapping
            mhnfs_path: path of the MHNFS code base (stored, not used by the
                methods of this class)
            debug: if True, the chunked readers stop after the first chunk
        """
        self.pubchem_dir = pubchem_dir
        self.fsmol_uid_path = fsmol_uid_path
        self.prot_class_path = prot_class_path
        self.mhnfs_path = mhnfs_path
        self.debug = debug

    def load_and_filter_assays(self):
        """
        Load PubChem Assay data from file and filter them:
        1. Drop all assays without protein accession keys
        2. Drop all assays linked to multiple accession keys
        3. Drop all assays with accession keys in FSmol training data

        The result is stored in ``self.df_assays`` (columns ``AID``, ``UID``);
        nothing is returned.
        """

        print('Load assays...')
        df_assays = pd.read_table(
            f'{self.pubchem_dir}/bioassays.tsv.gz',
            usecols=['AID', 'UniProts IDs'],
        ).rename(columns={'UniProts IDs': 'UID'})

        # Load FSmol training data accession keys. The JSON values are
        # iterated as lists of keys and flattened into one de-duplicated list;
        # empty keys are skipped because they would match every UID below.
        with open(self.fsmol_uid_path, 'r') as f:
            fs_train_targets = json.load(f).values()
        fs_train_targets = sorted({key for sublist in fs_train_targets for key in sublist if key})

        print('Filter assays...')
        df_assays = df_assays.dropna(subset=['UID'])
        # Multiple accession keys are separated by a literal '|'.
        df_assays = df_assays[~df_assays['UID'].astype(str).str.contains('|', regex=False)]
        # Without this guard an empty key list would produce the pattern '',
        # which matches every UID and would silently drop all assays.
        if fs_train_targets:
            pattern = '|'.join(re.escape(key) for key in fs_train_targets)
            df_assays = df_assays[~df_assays['UID'].astype(str).str.contains(pattern)]
        self.df_assays = df_assays

    def load_and_filter_bioactivities(self, chunk_size=10_000_000):
        """
        Load bioactivity data in chunks and filter out datapoints with
        1. assay not in the assays kept by ``load_and_filter_assays``
        2. outcome not 'Active'/'Inactive'
        3. missing compound id (CID)

        The outcome is encoded as ``Activity`` (1 = 'Active', 0 = 'Inactive')
        and the result is stored in ``self.df_bio`` with integer columns
        ``AID``, ``CID``, ``Activity``.

        Args:
            chunk_size: number of rows read from the file per chunk
        """

        print('Load bioactivities...')
        aids = self.df_assays['AID'].unique()
        filtered_chunks = []
        reader = pd.read_csv(
            f'{self.pubchem_dir}/bioactivities.tsv.gz',
            sep='\t',
            chunksize=chunk_size,
            usecols=['AID', 'CID', 'Activity Outcome'],
        )
        for chunk in reader:
            keep = chunk['AID'].isin(aids) & chunk['Activity Outcome'].isin(['Inactive', 'Active'])
            filtered_chunks.append(chunk[keep])
            if self.debug:
                break # For debugging
        df_bio = pd.concat(filtered_chunks)
        df_bio = df_bio[df_bio['CID'].notna()]
        # Vectorized label encoding; assign() returns a new frame, so no
        # chained-assignment warning is triggered on the filtered slice.
        df_bio = df_bio.assign(Activity=(df_bio['Activity Outcome'] == 'Active').astype(int))
        self.df_bio = df_bio.drop(columns='Activity Outcome').astype(int)

    def merge_assay_and_activity_data(self):
        """
        Attach the target (``UID``) of each assay to the bioactivities.

        Stores the merged table in ``self.df`` (``UID`` as str, every other
        column as int32) and deletes ``self.df_assays`` / ``self.df_bio``.
        """
        print('Merge...')
        self.df = self.df_bio.merge(self.df_assays, on='AID', how='left')
        convert_dict = {col: 'str' if col == 'UID' else 'int32' for col in self.df.columns}
        self.df = self.df.astype(convert_dict)
        del self.df_assays, self.df_bio

    def drop_hts_assays(self, max_datapoints=100_000):
        """
        Drop every assay that has more than ``max_datapoints`` rows.

        Args:
            max_datapoints: largest number of rows an assay may have to be kept
        """
        print('Drop HTS assays...')
        aid_counts = self.df.groupby('AID').size()
        filtered_aids = aid_counts[aid_counts <= max_datapoints].index
        self.df = self.df[self.df['AID'].isin(filtered_aids)]

    def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
        """
        Keep only targets with enough active and enough inactive datapoints.

        Args:
            na_min: minimum number of rows with ``Activity == 1`` per target
            ni_min: minimum number of rows with ``Activity == 0`` per target
        """
        print('Drop targets with not enough datapoints...')
        if self.df.empty:
            return
        # matrix: rows=targets, columns=0 (ninactives), 1 (nactives).
        # reindex() guarantees both columns exist even if one label never
        # occurs in the data (plain column access would raise KeyError).
        activity_counts = (
            self.df.groupby('UID')['Activity']
            .value_counts()
            .unstack(fill_value=0)
            .reindex(columns=[0, 1], fill_value=0)
        )
        mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min) # Both nactives and ninactives above nmin
        # Take the kept targets from the count matrix itself so that mask and
        # labels are aligned by construction.
        kept_uids = activity_counts[mask].index
        self.df = self.df[self.df['UID'].isin(kept_uids)]

    def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
        """
        Check if each target-compound pair is associated to an unique activity value,
        i.e. every measure either active or inactive. If not, drop it.

        Pairs measured several times with one consistent activity are reduced
        to their first row. The resulting ``self.df`` lists the pairs that
        occurred once first, followed by the reduced pairs sorted by
        (target, compound).

        Args:
            target_col: column identifying the target
            compound_col: column identifying the compound
        """

        print('Drop conflicting datapoints...')
        pair = [target_col, compound_col]

        # Split into pairs occurring once and pairs occurring several times
        is_repeated = self.df.duplicated(subset=pair, keep=False)
        df_uniques = self.df[~is_repeated]
        df_duplicates = self.df[is_repeated]

        # Nothing to resolve; concatenating an empty list would raise.
        if df_duplicates.empty:
            self.df = df_uniques
            return

        # Number of distinct activity values per pair, broadcast to each row
        n_outcomes = df_duplicates.groupby(pair)['Activity'].transform('nunique')
        df_rows = (
            df_duplicates[n_outcomes == 1]
            .drop_duplicates(subset=pair, keep='first')
            .sort_values(pair)
        )
        self.df = pd.concat([df_uniques, df_rows])

    def add_smiles(self, chunk_size=10_000_000):
        """
        Add a standardized ``SMILES`` column to ``self.df``.

        SMILES are read in chunks from ``smiles.tsv.gz`` (two columns without
        header: CID, SMILES), restricted to the compounds in ``self.df`` and
        standardized. Rows whose SMILES is missing or cannot be standardized
        are dropped.

        Args:
            chunk_size: number of rows read from the file per chunk
        """
        print('Retrieve SMILES...')
        cids = self.df.CID.astype(int).unique()
        filtered_chunks = []
        for chunk in pd.read_table(f'{self.pubchem_dir}/smiles.tsv.gz', chunksize=chunk_size, names=['CID', 'SMILES']):
            filtered_chunks.append(chunk[chunk['CID'].isin(cids)])
            if self.debug:
                break
        df_smiles = pd.concat(filtered_chunks)

        def cleanup(smiles):
            # Returns the standardized SMILES, or None (after printing the
            # offending input) if parsing or standardization fails.
            try:
                sm = Standardizer(metal_disconnect=True, canon_taut=True)
                mol = Chem.MolFromSmiles(smiles)
                standardized_mol, _ = sm.standardize_mol(mol)
                return Chem.MolToSmiles(standardized_mol)
            except Exception:
                # Exception (not a bare except) so that Ctrl-C still aborts.
                print(smiles)
                return None

        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
        df_smiles.dropna(inplace=True)

        self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])

    def print_stats(self):
        """Print the number of targets, assays, compounds and rows in ``self.df``."""
        nassays = self.df['AID'].nunique()
        ntargets = self.df["UID"].nunique()
        ncompounds = self.df["CID"].nunique()
        nactivities = self.df.shape[0]
        print(f'{ntargets: >5,} targets | {nassays: >6,} assays | {ncompounds: >9,} compounds | {nactivities: >10,} activity data points')

    def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
        """
        Write ``self.df`` to ``fname`` as CSV without the index.

        The parent directory is created if it does not exist yet.
        """
        print(f'Save to {fname}...')
        out_dir = os.path.dirname(fname)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        self.df.to_csv(fname, index=False)

    def load(self, fname):
        """Read a table previously written by ``save`` into ``self.df``."""
        print(f'Load from {fname}...')
        self.df = pd.read_csv(fname)

    def add_protein_classifications(self):
        """
        Retrieve protein classification

        Left-joins the columns ``Organism``, ``L1`` and ``L2`` of the
        classification CSV onto ``self.df`` via ``UID``.
        """
        print('Retrieve protein classifications...')
        protein_class = pd.read_csv(self.prot_class_path)
        print(protein_class)
        # The mapping file must already provide a 'UID' column. An earlier
        # version derived it from 'target_id':
        # protein_class['UID'] = protein_class['target_id'].swifter.apply(lambda x: x.split('_')[0])
        self.df = self.df.merge(protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left')

if __name__ == '__main__':
    # Build the filter from the module-level path constants (debug mode off).
    pubchem_filter = PubChemFilter(
        PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, False
    )

    # Read the raw assay and bioactivity tables.
    pubchem_filter.load_and_filter_assays()
    pubchem_filter.load_and_filter_bioactivities()

    # Each stage below is run in order and followed by a statistics printout.
    # The target-size filter is repeated after every step that removes rows,
    # and the conflict check is run once on CIDs and once on SMILES.
    stages = (
        (pubchem_filter.merge_assay_and_activity_data, {}),
        (pubchem_filter.drop_hts_assays, {}),
        (pubchem_filter.drop_targets_with_limited_data, {}),
        (pubchem_filter.drop_conflicting_bioactivity_measures, {}),
        (pubchem_filter.drop_targets_with_limited_data, {}),
        (pubchem_filter.add_smiles, {}),
        (pubchem_filter.drop_conflicting_bioactivity_measures, {'compound_col': 'SMILES'}),
        (pubchem_filter.drop_targets_with_limited_data, {}),
    )
    for run_stage, stage_kwargs in stages:
        run_stage(**stage_kwargs)
        pubchem_filter.print_stats()

    # Annotate the targets and write the final table.
    pubchem_filter.add_protein_classifications()
    pubchem_filter.save(fname='data/pubchem24/preprocessed.csv.gz')