# mhnfs/pubchem_experiment/data_preprocess.py
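"""
Preprocessing pipeline for the MHNFS PubChem experiment: load PubChem
bioassay and bioactivity dumps, filter them against the FS-Mol training
targets, standardize SMILES with RDKit, and attach protein classifications.
"""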
import json
import pandas as pd
import tqdm
import swifter
from rdkit import Chem
# Disable RDKit informational and warning messages
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
# Fill in these paths for your local setup before running.
PUBCHEM_DIR = ''      # pubchem_path + 'pubchem24/'
FSMOL_UID_PATH = ''   # fsmol_path + '/fsmol/fsmol_train_accession_keys.json'
PROT_CLASS_PATH = ''  # chembl_path + 'chembl33/uniprot_pclass_mapping.csv'
MHNFS_PATH = ''       # mhnfs_path + '/mhnfs'
import sys
sys.path.append(MHNFS_PATH)
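# Standardizer ships with the MHNFS repository (src/data_preprocessing/utils)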
from src.data_preprocessing.utils import Standardizer
class PubChemFilter:
    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug=False):
self.pubchem_dir = pubchem_dir
self.fsmol_uid_path = fsmol_uid_path
self.prot_class_path = prot_class_path
self.mhnfs_path = mhnfs_path
self.debug = debug
def load_and_filter_assays(self):
"""
Load PubChem Assay data from file and filter them:
1. Drop all assays without protein accession keys
2. Drop all assays linked to multiple accession keys
3. Drop all assays with accession keys in FSmol training data
Returns:
df_assays (pd.Dataframe)
"""
print('Load assays...')
        df_assays = pd.read_table(
            f'{self.pubchem_dir}/bioassays.tsv.gz',
            usecols=['AID', 'UniProts IDs'],
        ).rename(columns={'UniProts IDs': 'UID'})
# Load FSmol training data accession keys
with open(self.fsmol_uid_path, 'r') as f:
fs_train_targets = json.load(f).values()
fs_train_targets = list(set([key for sublist in fs_train_targets for key in sublist]))
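        # The accession-keys JSON is assumed to map FS-Mol training tasks to
        # lists of UniProt accession keys; the flattening above collects the
        # unique keys across all tasks.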
print('Filter assays...')
df_assays = df_assays.dropna(subset=['UID'])
        # Keep only assays with a single accession key (multi-target assays
        # separate keys with '|'), then drop assays whose target appears in
        # the FS-Mol training set
        df_assays = df_assays[~df_assays['UID'].str.contains('|', regex=False)]
        df_assays = df_assays[~df_assays['UID'].str.contains('|'.join(fs_train_targets))]
self.df_assays = df_assays
def load_and_filter_bioactivities(self, chunk_size=10_000_000):
"""
Load bioactivity data in chucks and filter out datapoints with
1. assay not in aids
2. outcome not 'Active'/'Inactive'
"""
print('Load bioactivities...')
aids = self.df_assays.AID.tolist()
filtered_chunks = []
for chunk in pd.read_csv(f'{self.pubchem_dir}/bioactivities.tsv.gz', sep='\t', chunksize=chunk_size, usecols=['AID', 'CID', 'Activity Outcome']):
filtered_chunk = chunk[chunk['AID'].isin(aids)]
filtered_chunk = filtered_chunk[filtered_chunk['Activity Outcome'].isin(['Inactive','Active'])]
filtered_chunks.append(filtered_chunk)
if self.debug:
break # For debugging
df_bio = pd.concat(filtered_chunks)
        df_bio = df_bio[df_bio.CID.notna()]
        # Encode outcomes as binary labels: 'Active' -> 1, 'Inactive' -> 0
        df_bio['Activity'] = df_bio['Activity Outcome'].swifter.apply(lambda x: 1 if x == 'Active' else 0)
        self.df_bio = df_bio.drop('Activity Outcome', axis=1).astype(int)
def merge_assay_and_activity_data(self):
print('Merge...')
self.df = self.df_bio.merge(self.df_assays, on='AID', how='left')
        # Downcast to int32 to save memory; UID stays a string
        convert_dict = {col: 'str' if col == 'UID' else 'int32' for col in self.df.columns}
self.df = self.df.astype(convert_dict)
del self.df_assays, self.df_bio
def drop_hts_assays(self):
print('Drop HTS assays...')
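        # Assays with more than 100,000 data points are treated as
        # high-throughput screening campaigns and removed.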
aid_counts = self.df.groupby('AID').size()
filtered_aids = aid_counts[aid_counts <= 100_000].index
self.df = self.df[self.df['AID'].isin(filtered_aids)]
def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
print('Drop targets with not enough datapoints...')
        unique_uids = self.df['UID'].sort_values().unique()  # Sorted unique targets
        # Count matrix: rows = sorted targets, columns = inactives (0) / actives (1)
        activity_counts = self.df.groupby('UID')['Activity'].value_counts().unstack().fillna(0)
        # Keep targets with at least na_min actives and ni_min inactives
        mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min)
        self.df = self.df[self.df['UID'].isin(unique_uids[mask])]
def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
"""
Check if each target-compound pair is associated to an unique activity value,
i.e. every measure either active or inactive. If not, drop it.
"""
def process_group(group):
if group['Activity'].nunique() == 1:
return group.head(1)
else:
return None
print('Drop conflicting datapoints...')
# Get unique UID-CID pairs and duplicated ones
df_uniques = self.df.drop_duplicates(subset=[target_col, compound_col], keep=False)
df_duplicates = self.df[~self.df.index.isin(df_uniques.index)]
# Check duplicated pairs
groups = df_duplicates.groupby([target_col, compound_col])
        rows = []
        for _, group in tqdm.tqdm(groups):
            rows.append(process_group(group))
        kept_rows = [row for row in rows if row is not None]
        # Guard against the edge case where every duplicated pair conflicts
        df_rows = pd.concat(kept_rows) if kept_rows else pd.DataFrame(columns=self.df.columns)
        self.df = pd.concat([df_uniques, df_rows])
def add_smiles(self, chunk_size=10_000_000):
print('Retrieve SMILES...')
cids = self.df.CID.astype(int).unique()
filtered_chunks = []
for chunk in pd.read_table(f'{self.pubchem_dir}/smiles.tsv.gz', chunksize=chunk_size, names=['CID', 'SMILES']):
filtered_chunk = chunk[chunk['CID'].isin(cids)]
filtered_chunks.append(filtered_chunk)
if self.debug:
break
df_smiles = pd.concat(filtered_chunks)
        def cleanup(smiles):
            sm = Standardizer(metal_disconnect=True, canon_taut=True)
            mol = Chem.MolFromSmiles(smiles)
            try:
                standardized_mol, _ = sm.standardize_mol(mol)
                return Chem.MolToSmiles(standardized_mol)
            except Exception:
                # Standardization failed; log the offending SMILES and drop it
                print(smiles)
                return None
        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
df_smiles.dropna(inplace=True)
self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])
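        # Note: standardization can map distinct CIDs to the same canonical
        # SMILES, which is why the pipeline re-runs the conflict check with
        # compound_col='SMILES' afterwards.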
def print_stats(self):
nassays = self.df['AID'].nunique()
ntargets = self.df["UID"].nunique()
ncompounds = self.df["CID"].nunique()
        nactivities = self.df.shape[0]
        print(f'{ntargets: >5,} targets | {nassays: >6,} assays | {ncompounds: >9,} compounds | {nactivities: >10,} activity data points')
def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
print(f'Save to {fname}...')
self.df.to_csv(fname, index=False)
def load(self, fname):
print(f'Load from {fname}...')
self.df = pd.read_csv(fname)
def add_protein_classifications(self):
"""
Retrieve protein classification
"""
print('Retrieve protein classifications...')
protein_class = pd.read_csv(self.prot_class_path)
print(protein_class)
# protein_class['UID'] = protein_class['target_id'].swifter.apply(lambda x: x.split('_')[0])
self.df = self.df.merge(protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left')
if __name__ == '__main__':
    # Create an instance of the PubChemFilter class
    pubchem_filter = PubChemFilter(PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, debug=False)
    # Run the preprocessing pipeline step by step, printing stats after each step
pubchem_filter.load_and_filter_assays()
pubchem_filter.load_and_filter_bioactivities()
pubchem_filter.merge_assay_and_activity_data()
pubchem_filter.print_stats()
pubchem_filter.drop_hts_assays()
pubchem_filter.print_stats()
pubchem_filter.drop_targets_with_limited_data()
pubchem_filter.print_stats()
pubchem_filter.drop_conflicting_bioactivity_measures()
pubchem_filter.print_stats()
pubchem_filter.drop_targets_with_limited_data()
pubchem_filter.print_stats()
pubchem_filter.add_smiles()
pubchem_filter.print_stats()
pubchem_filter.drop_conflicting_bioactivity_measures(compound_col='SMILES')
pubchem_filter.print_stats()
pubchem_filter.drop_targets_with_limited_data()
pubchem_filter.print_stats()
pubchem_filter.add_protein_classifications()
pubchem_filter.save(fname='data/pubchem24/preprocessed.csv.gz')
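# Usage sketch (assuming the path constants at the top have been filled in):
#   python data_preprocess.py
# With debug=True, only the first chunk of each large TSV is processed,
# which is useful for a quick smoke test.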