"""Filter PubChem bioassay data into a per-target activity table.

Run as a script, the pipeline loads assays and bioactivities, merges them,
drops large (HTS) assays, targets with too few data points and conflicting
measurements, attaches standardized SMILES and protein classifications, and
writes the result to a compressed CSV file.
"""
import json
import os
import re
import sys

import pandas as pd
import swifter  # noqa: F401  (importing registers the ``.swifter`` accessor)
import tqdm  # noqa: F401  (kept from the original import list)
from rdkit import Chem
from rdkit import RDLogger

# Disable RDKit informational and warning messages
RDLogger.DisableLog('rdApp.*')

# NOTE(review): in the original file these four assignments had no value at
# all (only the trailing comments), which is a syntax error. They are left as
# empty placeholders here and MUST be filled in before running the script.
PUBCHEM_DIR = ''  # pubchem_path + 'pubchem24/'
FSMOL_UID_PATH = ''  # fsmol_path + '/fsmol/fsmol_train_accession_keys.json'
PROT_CLASS_PATH = ''  # chembl_path + 'chembl33/uniprot_pclass_mapping.csv'
MHNFS_PATH = ''  # mhnfs_path + '/mhnfs'

# The Standardizer lives in the MHNFS code base, which is not an installed
# package; it has to be put on the import path first.
sys.path.append(MHNFS_PATH)
from src.data_preprocessing.utils import Standardizer  # noqa: E402


class PubChemFilter:
    """Step-wise filter for PubChem assay / bioactivity dumps.

    The methods are meant to be called in the order used by ``main()``; each
    one reads and replaces state stored on the instance (``df_assays``,
    ``df_bio`` and, after merging, ``df``).
    """

    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug = False):
        """Store input locations.

        Args:
            pubchem_dir: Directory containing ``bioassays.tsv.gz``,
                ``bioactivities.tsv.gz`` and ``smiles.tsv.gz``.
            fsmol_uid_path: JSON file whose values are lists of accession keys.
            prot_class_path: CSV file with protein classifications.
            mhnfs_path: Path of the MHNFS code base (stored only; the import
                path itself is set up at module level from ``MHNFS_PATH``).
            debug: If true, chunked readers stop after the first chunk.
        """
        self.pubchem_dir = pubchem_dir
        self.fsmol_uid_path = fsmol_uid_path
        self.prot_class_path = prot_class_path
        self.mhnfs_path = mhnfs_path
        self.debug = debug

    def load_and_filter_assays(self):
        """Load PubChem assay data from file and filter it.

        1. Drop all assays without protein accession keys
        2. Drop all assays linked to multiple accession keys
        3. Drop all assays with accession keys in FSmol training data

        The result (columns ``AID`` and ``UID``) is stored in
        ``self.df_assays``; nothing is returned.
        """
        print('Load assays...')
        df_assays = pd.read_table(
            f'{self.pubchem_dir}/bioassays.tsv.gz',
            usecols=['AID', 'UniProts IDs'],
        ).rename(columns={'UniProts IDs' : 'UID'})

        # Load FSmol training data accession keys (a dict of key lists).
        with open(self.fsmol_uid_path, 'r') as f:
            key_lists = json.load(f).values()
        fs_train_targets = sorted({key for sublist in key_lists for key in sublist})

        print('Filter assays...')
        df_assays = df_assays.dropna(subset=['UID'])
        # '|' separates multiple accession keys; match it literally.
        df_assays = df_assays[~df_assays['UID'].str.contains('|', regex=False)]
        # Guard the empty case: an empty pattern matches every string and
        # would silently drop all assays. Keys are escaped so they are always
        # matched literally (substring match, as before).
        if fs_train_targets:
            pattern = '|'.join(re.escape(key) for key in fs_train_targets)
            df_assays = df_assays[~df_assays['UID'].str.contains(pattern)]

        self.df_assays = df_assays

    def load_and_filter_bioactivities(self, chunk_size=10_000_000):
        """Load bioactivity data in chunks and filter out datapoints with

        1. assay not in the filtered assay table (``self.df_assays``)
        2. outcome not 'Active'/'Inactive'
        3. missing compound id

        Args:
            chunk_size: Number of rows read per chunk.

        The result (integer columns ``AID``, ``CID`` and ``Activity``, with
        1 = active and 0 = inactive) is stored in ``self.df_bio``.
        """
        print('Load bioactivities...')
        aids = self.df_assays.AID.tolist()

        filtered_chunks = []
        reader = pd.read_csv(
            f'{self.pubchem_dir}/bioactivities.tsv.gz',
            sep='\t',
            chunksize=chunk_size,
            usecols=['AID', 'CID', 'Activity Outcome'],
        )
        for chunk in reader:
            filtered_chunk = chunk[chunk['AID'].isin(aids)]
            filtered_chunk = filtered_chunk[
                filtered_chunk['Activity Outcome'].isin(['Inactive', 'Active'])
            ]
            filtered_chunks.append(filtered_chunk)
            if self.debug:
                break  # For debugging: only look at the first chunk

        df_bio = pd.concat(filtered_chunks)
        df_bio = df_bio[df_bio.CID.notna()]
        # Only 'Active'/'Inactive' remain, so a comparison yields the label.
        df_bio = df_bio.assign(
            Activity=(df_bio['Activity Outcome'] == 'Active').astype(int)
        )
        self.df_bio = df_bio.drop('Activity Outcome', axis=1).astype(int)

    def merge_assay_and_activity_data(self):
        """Attach the target (``UID``) of each assay to the bioactivities.

        Stores the merged table in ``self.df`` and releases the two inputs.
        """
        print('Merge...')
        self.df = self.df_bio.merge(self.df_assays, on='AID', how='left')
        # Compact dtypes: everything but the accession key is an integer.
        convert_dict = {
            col: 'int32' if col != 'UID' else 'str' for col in self.df.columns
        }
        self.df = self.df.astype(convert_dict)
        del self.df_assays, self.df_bio

    def drop_hts_assays(self, max_datapoints=100_000):
        """Drop every assay that has more than ``max_datapoints`` rows.

        Args:
            max_datapoints: Largest number of data points an assay may have
                to be kept.
        """
        print('Drop HTS assays...')
        aid_counts = self.df.groupby('AID').size()
        filtered_aids = aid_counts[aid_counts <= max_datapoints].index
        self.df = self.df[self.df['AID'].isin(filtered_aids)]

    def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
        """Keep only targets with enough active and inactive data points.

        Args:
            na_min: Minimum number of actives (``Activity == 1``) per target.
            ni_min: Minimum number of inactives (``Activity == 0``) per target.
        """
        print('Drop targets with not enough datapoints...')
        uids = self.df['UID'].to_numpy()
        # Counting each class separately works even if one class is absent
        # from the whole table, and keeps counts and mask on the same index.
        n_actives = self.df['Activity'].eq(1).groupby(uids).sum()
        n_inactives = self.df['Activity'].eq(0).groupby(uids).sum()
        enough = (n_actives >= na_min) & (n_inactives >= ni_min)
        self.df = self.df[self.df['UID'].isin(n_actives.index[enough])]

    def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
        """Reduce the table to one row per target-compound pair.

        Pairs measured once are kept as they are. Pairs measured several
        times are kept (first row only) if all measurements agree on the
        activity value, and dropped entirely otherwise.

        Args:
            target_col: Column identifying the target.
            compound_col: Column identifying the compound.
        """
        print('Drop conflicting datapoints...')
        keys = [target_col, compound_col]

        is_repeated = self.df.duplicated(subset=keys, keep=False)
        df_uniques = self.df[~is_repeated]
        df_duplicates = self.df[is_repeated]
        if df_duplicates.empty:
            self.df = df_uniques
            return

        n_outcomes = df_duplicates.groupby(keys)['Activity'].transform('nunique')
        # First row of every consistent pair, ordered by key like the
        # group-wise iteration this replaces.
        df_rows = (
            df_duplicates[n_outcomes == 1]
            .drop_duplicates(subset=keys, keep='first')
            .sort_values(keys)
        )
        self.df = pd.concat([df_uniques, df_rows])

    def add_smiles(self, chunk_size=10_000_000):
        """Add a standardized ``SMILES`` column; drop rows without one.

        Args:
            chunk_size: Number of rows of the SMILES file read per chunk.
        """
        print('Retrieve SMILES...')
        cids = self.df.CID.astype(int).unique()

        filtered_chunks = []
        reader = pd.read_table(
            f'{self.pubchem_dir}/smiles.tsv.gz',
            chunksize=chunk_size,
            names=['CID', 'SMILES'],
        )
        for chunk in reader:
            filtered_chunks.append(chunk[chunk['CID'].isin(cids)])
            if self.debug:
                break  # For debugging: only look at the first chunk
        df_smiles = pd.concat(filtered_chunks)

        def cleanup(smiles):
            """Return the standardized SMILES, or None if that fails."""
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:  # RDKit could not parse the input
                    print(smiles)
                    return None
                # NOTE(review): a new Standardizer is built per molecule, as
                # in the original; reuse one instance if it is stateless.
                sm = Standardizer(metal_disconnect=True, canon_taut=True)
                standardized_mol, _ = sm.standardize_mol(mol)
                return Chem.MolToSmiles(standardized_mol)
            except Exception:
                # Best effort: report the offending input and drop it later.
                print(smiles)
                return None

        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
        df_smiles.dropna(inplace=True)

        self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])

    def print_stats(self):
        """Print the number of targets, assays, compounds and data points."""
        nassays = self.df['AID'].nunique()
        ntargets = self.df["UID"].nunique()
        ncompounds = self.df["CID"].nunique()
        nactivities = self.df.shape[0]
        print(f'{ntargets: >5,} targets | {nassays: >6,} assays | {ncompounds: >9,} compounds | {nactivities: >10,} activity data points')

    def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
        """Write ``self.df`` to ``fname`` as CSV, creating its directory."""
        print(f'Save to {fname}...')
        out_dir = os.path.dirname(fname)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        self.df.to_csv(fname, index=False)

    def load(self, fname):
        """Read a previously saved table from ``fname`` into ``self.df``."""
        print(f'Load from {fname}...')
        self.df = pd.read_csv(fname)

    def add_protein_classifications(self):
        """Add the ``Organism``, ``L1`` and ``L2`` columns for each target.

        The classification file must provide a ``UID`` column to join on.
        """
        print('Retrieve protein classifications...')
        protein_class = pd.read_csv(self.prot_class_path)
        print(protein_class)
        self.df = self.df.merge(
            protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left'
        )


def main():
    """Run the full preprocessing pipeline and save the result."""
    # Create an instance of PubChemFilter class
    pubchem_filter = PubChemFilter(PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, False)

    # Call methods of the class as needed
    pubchem_filter.load_and_filter_assays()
    pubchem_filter.load_and_filter_bioactivities()
    pubchem_filter.merge_assay_and_activity_data()
    pubchem_filter.print_stats()

    pubchem_filter.drop_hts_assays()
    pubchem_filter.print_stats()

    pubchem_filter.drop_targets_with_limited_data()
    pubchem_filter.print_stats()

    pubchem_filter.drop_conflicting_bioactivity_measures()
    pubchem_filter.print_stats()

    pubchem_filter.drop_targets_with_limited_data()
    pubchem_filter.print_stats()

    pubchem_filter.add_smiles()
    pubchem_filter.print_stats()

    # Different CIDs can standardize to the same SMILES, so repeat the
    # conflict check on the structure level.
    pubchem_filter.drop_conflicting_bioactivity_measures(compound_col='SMILES')
    pubchem_filter.print_stats()

    pubchem_filter.drop_targets_with_limited_data()
    pubchem_filter.print_stats()

    pubchem_filter.add_protein_classifications()
    pubchem_filter.save(fname='data/pubchem24/preprocessed.csv.gz')


if __name__ == '__main__':
    main()